def bake_job(sh_script_path, rle_branchNames, treeName, py_script_path, sh_args, py_args, logfile,
             submit_job=True):
    '''Bakes an sbatch job for us

    Args:
        sh_script_path: string, Path to the bash script
        rle_branchNames: dict { string : string }, Branch names of run, lumi and event
        treeName: string, Name of the TTree
        py_script_path: string, Path to the python script
        sh_args: string, Path to the scratch directory
        py_args: zip, Arguments to the python script
        logfile: string, Path to the sbatch log file
        submit_job: bool, If True, submit the job (default); if False, create the scripts only

    Returns:
        int, ID of the submitted sbatch job (None if submit_job is False)
    '''
    def chmod_x(f):
        st = os.stat(f)
        os.chmod(f, st.st_mode | stat.S_IEXEC)

    with open(sh_script_path, 'w') as sh_script:
        sh_script.write(jinja2.Template(dumpSh).render(
            py_script=py_script_path,
            scratch_dir=sh_args,
        ))
    chmod_x(sh_script_path)

    with open(py_script_path, 'w') as py_script:
        py_script.write(jinja2.Template(dumPy).render(
            input_list=py_args,
            run=rle_branchNames['run'],
            lumi=rle_branchNames['lumi'],
            event=rle_branchNames['event'],
            tree=treeName,
        ))
    chmod_x(py_script_path)

    if submit_job:
        submit_cmd = "sbatch --mem=1800M --partition=short --output={logfile} {bash_script}".format(
            logfile=logfile,
            bash_script=sh_script_path,
        )
        submit = subprocess.Popen(
            submit_cmd,
            stdout=subprocess.PIPE,
            stderr=subprocess.PIPE,
            shell=True,
        )
        submit_out, submit_err = submit.communicate()
        logging.debug("\n%s\n%s" % (submit_out, submit_err))
        logging.debug("Submitted the job")
        # sbatch prints e.g. 'Submitted batch job 12345' -> the job ID is the last token
        return int(submit_out.split()[-1])

def skim(in_filename, out_filename, entry_list, tree_name="Events"):
    '''Copies the selected entries of a TTree into a new ROOT file

    Args:
        in_filename: string, Path to the input ROOT file
        out_filename: string, Path to the output ROOT file
        entry_list: int array, List of entries (not RLE numbers!) which are used to select the events
        tree_name: string, TTree name (default: Events)

    Returns:
        True, if none of the entry numbers exceeds the actual number of entries in the ROOT file
        False, otherwise
    '''
    logging.debug("Processing file {in_filename}".format(in_filename=in_filename))
    f_in = ROOT.TFile(in_filename, "read")
    t_in = f_in.Get(tree_name)

    nof_entries = t_in.GetEntries()
    # entry numbers are 0-based, so the largest valid entry is (nof_entries - 1)
    if max(entry_list) >= nof_entries:
        logging.error("Max entry ID exceeds the number of entries in {root_filename}: {max_entry} >= {nof_entries}".format(
            root_filename=in_filename,
            max_entry=max(entry_list),
            nof_entries=nof_entries,
        ))
        return False

    f_out = ROOT.TFile(out_filename, "recreate")
    t_out = t_in.CloneTree(0)
    for i in entry_list:
        t_in.GetEntry(i)
        t_out.Fill()
    t_out.AutoSave()
    logging.debug("Saved events to {out_filename}".format(out_filename=out_filename))
    return True

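# A minimal usage sketch (file names are hypothetical; note that the entries
# are 0-based TTree indices, not RLE numbers):
#
#   if skim('input.root', 'skimmed.root', [0, 5, 42], tree_name='Events'):
#       logging.info("Skimming succeeded")
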
def save_graph(gen_parts, output_file_name, keep_tmp=False):
    graph_nodes = []
    graph_edges = []
    for gen_part in gen_parts:
        graph_nodes.append([
            gen_part.idx, gen_part.pdgId, gen_part.status, gen_part.pt,
            gen_part.eta, gen_part.phi, gen_part.mass
        ])
        if gen_part.momIdx >= 0:
            graph_edges.append([gen_part.momIdx, gen_part.idx])

    output_file_name_dot = output_file_name.replace('.png', '.dot')
    with open(output_file_name_dot, "w") as dot_file:
        dot_file.write(jinja2.Template(GRAPH_TEMPLATE).render(nodes=graph_nodes, edges=graph_edges))

    output_file_name_eps = output_file_name.replace('.png', '.eps')
    subprocess.call("dot -Teps {} > {}".format(output_file_name_dot, output_file_name_eps), shell=True)
    subprocess.call("convert -flatten -density 150 {} {}".format(output_file_name_eps, output_file_name), shell=True)
    logging.debug("Created file {}".format(output_file_name))

    if not keep_tmp:
        os.remove(output_file_name_dot)
        os.remove(output_file_name_eps)

def submit(self, cmd_str):
    nof_max_retries = 10
    current_retry = 0
    job_id = None  # stays None if all retries fail, so the check below can report it
    while current_retry < nof_max_retries:
        # Run command
        cmd_outerr = run_cmd(cmd_str, return_stderr=True)
        try:
            job_id = cmd_outerr[0].split()[-1]
            break
        except IndexError:
            # Fails if the stdout returned by the last line is empty
            logging.warning("Caught an error: '%s'; resubmitting %i-th time" % (cmd_outerr[1], current_retry))
            current_retry += 1
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)  # Let's wait for 60 seconds until the next resubmission

    # The job ID must be a number, so... we have to check if it really is one
    try:
        int(job_id)
    except (ValueError, TypeError):
        # TypeError covers the case where all retries failed and job_id is still None
        raise ValueError("job_id = '%s' NaN; sbatch stdout = '%s'; sbatch stderr = '%s'" % \
                         (job_id, cmd_outerr[0], cmd_outerr[1]))
    if job_id in self.submittedJobs:
        raise RuntimeError("Same job ID: %s" % job_id)
    # Is a valid job ID
    return job_id

def dump_rle(input_file, output_file, tree_name='Events', run_br='run', lumi_br='luminosityBlock', event_br='event'):
    with open(output_file, 'w') as f:
        ch_root = ROOT.TChain(tree_name)
        ch_root.AddFile(input_file)

        run_a = array.array('I', [0])
        lumi_a = array.array('I', [0])
        evt_a = array.array('L', [0])

        ch_root.SetBranchAddress(run_br, run_a)
        ch_root.SetBranchAddress(lumi_br, lumi_a)
        ch_root.SetBranchAddress(event_br, evt_a)

        nof_entries = ch_root.GetEntries()
        rle_i_arr = []
        for i in range(nof_entries):
            ch_root.GetEntry(i)
            rle_i_arr.append(':'.join(map(str, [run_a[0], lumi_a[0], evt_a[0]])))

        f.write("{rle_lines}\n".format(rle_lines='\n'.join(rle_i_arr)))

    logging.debug("Wrote {nof_bytes} kB to {filename}".format(
        nof_bytes=os.path.getsize(output_file) / 1000,
        filename=output_file,
    ))
    return

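# A minimal usage sketch, assuming a NanoAOD-style file whose 'Events' tree
# carries the default run/luminosityBlock/event branches (file names are
# hypothetical):
#
#   dump_rle('tree_1.root', '1.txt')
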
def hlt_version(hlt_path):
    if not HLT_SUFFIX.match(hlt_path):
        new_hlt_path = '{}_v*'.format(hlt_path)
        logging.debug("Changing {} to {} in brilcalc queries".format(hlt_path, new_hlt_path))
        return new_hlt_path
    else:
        return hlt_path

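# Illustrative behaviour, assuming HLT_SUFFIX matches paths that already carry
# a version suffix such as '_v3' or '_v*':
#
#   hlt_version('HLT_IsoMu24')     # -> 'HLT_IsoMu24_v*'
#   hlt_version('HLT_IsoMu24_v3')  # -> 'HLT_IsoMu24_v3' (unchanged)
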
def get_paths(input_paths, whitelist, blacklist):
    valid_paths = {}
    for input_path in input_paths:
        input_path_split = [subpath for subpath in input_path.split(os.path.sep) if subpath != '']
        nof_levels = len(input_path_split)
        if nof_levels == 6:
            input_path_subdir = os.path.join(input_path, OUTPUT_RLE)
            if not hdfs.isdir(input_path_subdir):
                raise ValueError("No such directory: %s" % input_path_subdir)
            for channel_dir in sorted(hdfs.listdir(input_path_subdir)):
                channel_name = os.path.basename(channel_dir)
                if whitelist and channel_name not in whitelist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in blacklist:
                    logging.info("Excluding channel: {}".format(channel_name))
                    continue
                if channel_name in valid_paths:
                    raise ValueError("Found duplicate paths for the same channel: %s and %s" %
                                     (valid_paths[channel_name], input_path))
                logging.debug('Found channel {} at path {}'.format(channel_name, channel_dir))
                valid_paths[channel_name] = channel_dir
        elif nof_levels == 8:
            if input_path_split[-2] != OUTPUT_RLE:
                raise ValueError("Invalid path: %s" % input_path)
            channel_name = input_path_split[-1]
            if whitelist and channel_name not in whitelist:
                raise ValueError("Path %s conflicting with whitelist: %s" % (input_path, ', '.join(whitelist)))
            if channel_name in blacklist:
                raise ValueError("Path %s conflicting with blacklist: %s" % (input_path, ', '.join(blacklist)))
            if channel_name in valid_paths:
                raise ValueError("Found duplicate paths for the same channel: %s and %s" %
                                 (valid_paths[channel_name], input_path))
            logging.debug('Found channel {} at path {}'.format(channel_name, input_path))
            valid_paths[channel_name] = input_path
        else:
            raise ValueError("Invalid path: %s" % input_path)
    assert(len(set(valid_paths.values())) == len(valid_paths))
    return valid_paths

def cleanup(dir_name, exit=False):
    '''Removes a directory and may exit hard from the script

    Args:
        dir_name: string, Path to the directory which is to be removed
        exit: bool, If True, exits the script via sys.exit() (default: False)

    Returns:
        None
    '''
    logging.debug("Removing directory {dir_name}".format(dir_name=dir_name))
    try:
        shutil.rmtree(dir_name)
    except OSError:
        # shutil.rmtree() raises OSError, not IOError
        logging.error("Caught an error while removing directory {dir_name}".format(dir_name=dir_name))
        if exit:
            sys.exit(1)

def create_dir_if_not_exist(d):
    '''Creates the directory if it doesn't exist

    Args:
        d: string, the directory to be created

    Returns:
        True, if the directory exists or was successfully created
        False, otherwise
    '''
    if not os.path.isdir(d):
        logging.debug("Directory '{dir_name}' doesn't exist, attempting to create it".format(dir_name=d))
        try:
            os.makedirs(d)
        except OSError:
            # os.makedirs() raises OSError, not IOError
            logging.error("Could not create the directory")
            return False
    return True

def check_dir(dirname, use_force):
    if not os.path.isdir(dirname):
        if not use_force:
            logging.error("Directory '{output_dir}' does not exist".format(output_dir=dirname))
            return False
        else:
            logging.debug("Creating directory '{output_dir}' since it's missing".format(output_dir=dirname))
            try:
                os.makedirs(dirname)
            except OSError as err:
                # os.makedirs() raises OSError, not IOError
                logging.error("Caught an error while creating directory '{output_dir}': {reason}".format(
                    output_dir=dirname,
                    reason=err,
                ))
                return False
    return True

def run_cmd(command, do_not_log=False, stdout_file=None, stderr_file=None, return_stderr=False):
    """Runs the given command and logs its stdout and stderr to files
    """
    p = subprocess.Popen(command, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    stdout, stderr = p.communicate()
    # Remove trailing newline
    stdout = stdout.rstrip('\n')
    stderr = stderr.rstrip('\n')

    if stdout_file:
        stdout_file.write(command + "\n")
        stdout_file.write('%s\n' % stdout)
    if stderr_file:
        stderr_file.write('%s\n' % stderr)

    if not do_not_log:
        logging.debug("Executed command: '%s'" % command)
        logging.debug("stdout: '%s'" % stdout)
        logging.debug("stderr: '%s'" % stderr)

    if return_stderr:
        return stdout, stderr
    return stdout

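# Usage sketch:
#
#   stdout = run_cmd('ls -l')
#   stdout, stderr = run_cmd('stat no_such_file', return_stderr=True)
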
def skim_debug(out_filename, rle_list, tree_name="tree"):
    '''Checks if skimming was successful by comparing the RLE numbers in the output file
       to the given list of RLE numbers

    Args:
        out_filename: string, Path to the file whose RLE numbers are compared against the RLE array
        rle_list: string array, List of RLE numbers as strings
        tree_name: string, TTree name (default: tree)

    Returns:
        True, if the RLE numbers in the file match exactly the given input list of RLE numbers
        False, otherwise
    '''
    logging.debug("Checking if {out_filename} contains exactly the same events as provided by the RLE file".format(
        out_filename=out_filename,
    ))
    if not hdfs.isfile(out_filename):
        return False
    out_rle_list = get_rle(out_filename, tree_name)

    missing_from_file = list(set(rle_list) - set(out_rle_list))
    excess_in_file = list(set(out_rle_list) - set(rle_list))

    ret_val = True
    if missing_from_file:
        logging.error("There are {nof_missing} events missing from {out_filename}: {list_of_missing_events}".format(
            nof_missing=len(missing_from_file),
            out_filename=out_filename,
            list_of_missing_events=', '.join(missing_from_file),
        ))
        ret_val = False
    if excess_in_file:
        logging.error("There are {nof_excess} events in excess in the file {out_filename}: {list_of_excess_events}".format(
            nof_excess=len(excess_in_file),
            out_filename=out_filename,
            list_of_excess_events=', '.join(excess_in_file),
        ))
        ret_val = False
    return ret_val

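# Typical use together with skim(), to verify that the skimmed file holds
# exactly the requested events (variable names are hypothetical):
#
#   if skim(in_fn, out_fn, entries) and skim_debug(out_fn, rle_list, tree_name='Events'):
#       logging.info("Skim verified")
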
def __init__(self):
    self.lib_path = "/usr/lib64/libhdfs.so"
    if not os.path.isfile(self.lib_path):
        raise hdfsException("No such file: %s" % self.lib_path)

    logging.debug("Loading {lib}".format(lib=self.lib_path))
    self.lib = ctypes.cdll.LoadLibrary(self.lib_path)
    self.lib.hdfsListDirectory.restype = ctypes.POINTER(hdfs.hdfsFileInfo)
    self.lib.hdfsGetPathInfo.restype = ctypes.POINTER(hdfs.hdfsFileInfo)
    self.hdfsFileInfo_size = ctypes.sizeof(hdfs.hdfsFileInfo)

    logging.debug("Building HDFS interface")
    self.bld = self.lib.hdfsNewBuilder()
    if not self.bld:
        raise hdfsException("Could not create new HDFS interface")
    self.lib.hdfsBuilderSetNameNode(self.bld, "default")
    self.lib.hdfsBuilderSetNameNodePort(self.bld, 0)

    logging.debug("Connecting to the HDFS interface")
    self.fs = self.lib.hdfsBuilderConnect(self.bld)
    if not self.fs:
        raise hdfsException("Could not connect to the HDFS interface")

if args.input:
    input_file = args.input
    if not os.path.isfile(input_file):
        logging.error("No such file: {input_filename}".format(input_filename=input_file))
        sys.exit(1)

    output_file = output
    parent_dir = os.path.dirname(os.path.abspath(output_file))
    if not check_dir(parent_dir, use_force):
        sys.exit(1)

    logging.debug("Saving RLE numbers from {input_file} to {output_file}".format(
        input_file=input_file,
        output_file=output_file,
    ))
    dump_rle(input_file, output_file, args.tree, args.run, args.lumi, args.event)
else:
    samples = load_samples(args.era, is_postproc=args.post_processed)

    output_dir = output
    if not check_dir(output_dir, use_force):
        sys.exit(1)

    idx = lambda x: int(x[x.rfind('_') + 1:x.rfind('.')])

def get_graph(input_file_name, rles, mtable):
    logging.debug('Opening file {}'.format(input_file_name))
    input_file = ROOT.TFile.Open(input_file_name, 'read')
    assert(input_file)
    input_tree = input_file.Get('Events')
    assert(input_tree)

    run_branch = array.array('I', [0])
    luminosityBlock_branch = array.array('I', [0])
    event_branch = array.array('L', [0])
    input_tree.SetBranchAddress('run', run_branch)
    input_tree.SetBranchAddress('luminosityBlock', luminosityBlock_branch)
    input_tree.SetBranchAddress('event', event_branch)

    genPartCollection = GenPartCollection(input_tree, mtable)

    graph_map = {}
    nof_events = input_tree.GetEntries()
    logging.debug('Found {} events in the file'.format(nof_events))
    for event_idx in range(nof_events):
        input_tree.GetEntry(event_idx)
        rle = ':'.join(map(lambda branch: str(branch[0]), [run_branch, luminosityBlock_branch, event_branch]))
        if rle not in rles:
            continue
        logging.debug('Found event {} at index {}'.format(rle, event_idx))
        gen_parts = genPartCollection.read()
        logging.debug('Found {} generator level particles in the event'.format(len(gen_parts)))
        for gen_part in gen_parts:
            logging.debug(gen_part)
        graph_map[rle] = gen_parts
        if all(requested_rle in graph_map for requested_rle in rles):
            break

    input_file.Close()
    logging.debug('Found {} matches in file {}'.format(len(graph_map), input_file_name))
    return graph_map

def submitJob(
    self,
    inputFiles,
    executable,
    command_line_parameter,
    outputFilePath,
    outputFiles,
    scriptFile,
    logFile=None,
    skipIfOutputFileExists=False,
    job_template_file='sbatch-node.sh.template',
    copy_output_file=True,
    nof_submissions=0,
):
    """Renders the job script from the given template and queues the job for submission to SLURM
    """
    logging.debug("<sbatchManager::submitJob>: job_template_file = '%s'" % job_template_file)

    job_template_file = os.path.join(jinja_template_dir, job_template_file)
    job_template = open(job_template_file, 'r').read()

    # raise if logfile missing
    if not logFile:
        if not self.logFileDir:
            raise ValueError("Please call 'setLogFileDir' before calling 'submitJob' !!")
        logFile = os.path.join(self.logFileDir, os.path.basename(scriptFile).replace(".sh", ".log"))

    # skip only if none of the output files are missing in the file system
    outputFiles_fullpath = map(lambda outputFile: os.path.join(outputFilePath, outputFile), outputFiles)
    if skipIfOutputFileExists:
        outputFiles_missing = [
            outputFile for outputFile in outputFiles_fullpath \
            if not is_file_ok(outputFile, validate_outputs=True, min_file_size=self.min_file_size)
        ]
        if not outputFiles_missing:
            logging.debug(
                "output file(s) = %s exist(s) --> skipping !!" % \
                '; '.join(map(lambda x: "'%s'" % x, outputFiles_fullpath))
            )
            return

    if not self.workingDir:
        raise ValueError("Please call 'setWorkingDir' before calling 'submitJob' !!")

    if not self.cmssw_base_dir:
        logging.warning("cmssw_base_dir not set, setting it to '%s'" % os.environ.get('CMSSW_BASE'))
        self.cmssw_base_dir = os.environ.get('CMSSW_BASE')

    job_dir = self.get_job_dir()

    # create script for executing jobs
    wrapper_log_file = logFile.replace('.log', '_wrapper.log')
    executable_log_file = logFile.replace('.log', '_executable.log')
    wrapper_log_file, executable_log_file = get_log_version((wrapper_log_file, executable_log_file))

    sbatch_command = "sbatch --partition={partition} --output={output} --comment='{comment}' " \
                     "{max_mem} {args} {cmd}".format(
        partition = self.queue,
        output    = wrapper_log_file,
        comment   = self.pool_id,
        args      = self.sbatchArgs,
        cmd       = scriptFile,
        max_mem   = '--mem={}'.format(self.max_mem) if self.max_mem else '',
    )

    two_pow_sixteen = 65536
    random.seed((abs(hash(command_line_parameter))) % two_pow_sixteen)
    max_delay = 60
    random_delay = random.randint(0, max_delay)

    script = jinja2.Template(job_template).render(
        working_dir            = self.workingDir,
        cmssw_base_dir         = self.cmssw_base_dir,
        job_dir                = job_dir,
        job_template_file      = job_template_file,
        exec_name              = executable,
        command_line_parameter = command_line_parameter,
        inputFiles             = " ".join(inputFiles),
        outputDir              = outputFilePath,
        outputFiles            = " ".join(outputFiles),
        wrapper_log_file       = wrapper_log_file,
        executable_log_file    = executable_log_file,
        script_file            = scriptFile,
        RUNNING_COMMAND        = sbatch_command,
        random_sleep           = random_delay,
        copy_output_file       = copy_output_file,
    )
    logging.debug("writing sbatch script file = '%s'" % scriptFile)
    with codecs.open(scriptFile, "w", "utf-8") as f:
        f.write(script)
        f.flush()
        os.fsync(f.fileno())

    if self.dry_run:
        return

    nof_submissions += 1
    job = {
        'sbatch_command'  : sbatch_command,
        'status'          : Status.in_queue,
        'log_wrap'        : wrapper_log_file,
        'log_exec'        : executable_log_file,
        'args'            : (
            inputFiles,
            executable,
            command_line_parameter,
            outputFilePath,
            outputFiles,
            scriptFile,
            logFile,
            skipIfOutputFileExists,
            job_template_file,
            nof_submissions,
        ),
        'nof_submissions' : nof_submissions,
        'outputFiles'     : outputFiles_fullpath,
    }
    self.queuedJobs.append(job)

def poll(self, nonBlocking):
    """Waits for all sbatch jobs submitted by this instance of sbatchManager to finish processing
    """
    text_line = '-' * 120

    # Set a delimiter, which distinguishes entries b/w different jobs
    delimiter = ','
    # Explanation (the maximum pool ID length = 256 is configurable via self.max_pool_id_length):
    # 1) squeue -h -u {{user}} -o '%i %256k'
    #      Collects the list of running jobs
    #        a) -h omits the header
    #        b) -u {{user}} looks only for jobs submitted by {{user}}
    #        c) -o '%i %256k' specifies the output format
    #           i)  %i    -- job ID (1st column)
    #           ii) %256k -- comment with a width of 256 characters (2nd column)
    #               If the job has no comment, the entry simply reads (null)
    # 2) grep {{comment}}
    #      Filters the jobs by the comment, which must be unique per sbatchManager instance at all times
    # 3) awk '{print $1}'
    #      Filters only the job IDs out
    # 4) sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'
    #      Places all job IDs on one line, delimited by {{delimiter}} (otherwise the logs are hard to read)
    command_template = "squeue -h -u {{user}} -o '%i %{{ pool_id_length }}k' | grep {{comment}} | awk '{print $1}' | " \
                       "sed ':a;N;$!ba;s/\\n/{{delimiter}}/g'"
    command = jinja2.Template(command_template).render(
        user           = self.user,
        pool_id_length = self.max_pool_id_length,
        comment        = self.pool_id,
        delimiter      = delimiter,
    )

    # Initially, all jobs are marked as submitted, so we have to go through all jobs and check their exit codes
    # even if some of them have already finished
    jobIds_set = set([
        job_id for job_id in self.submittedJobs
        if self.submittedJobs[job_id]['status'] == Status.submitted
    ])
    nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
    while nofJobs_left > 0:
        # Get the list of jobs submitted to the batch system and convert their job IDs to a set
        poll_result, poll_result_err = '', ''
        while True:
            poll_result, poll_result_err = run_cmd(command, do_not_log=False, return_stderr=True)
            if not poll_result and poll_result_err:
                logging.warning('squeue caught an error: {squeue_error}'.format(squeue_error=poll_result_err))
            else:
                break
            # sleep a minute and then try again
            # in principle we could limit the number of retries, but hopefully that's not necessary
            logging.debug("sleeping for %i seconds." % 60)
            time.sleep(60)
        polled_ids = set()
        if poll_result != '':
            polled_ids = set(poll_result.split(delimiter))

        # Check if the number of jobs submitted to the batch system is below maxSubmittedJobs;
        # if it is, take jobs from the queuedJobs list and submit them,
        # until a total of maxSubmittedJobs is submitted to the batch system
        nofJobs_toSubmit = min(len(self.queuedJobs), self.maxSubmittedJobs - len(polled_ids))
        if nofJobs_toSubmit > 0:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> submitting the next {} jobs."
                .format(len(polled_ids), len(self.queuedJobs), nofJobs_toSubmit))
        else:
            logging.debug(
                "Jobs: submitted = {}, in queue = {} --> waiting for submitted jobs to finish processing."
                .format(len(polled_ids), len(self.queuedJobs)))
        for i in range(0, nofJobs_toSubmit):
            # randomly submit a job from the queue
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_idx = len(self.queuedJobs) - 1
            random_idx = random.randint(0, max_idx)
            job = self.queuedJobs.pop(random_idx)
            job['status'] = Status.submitted
            job_id = self.submit(job['sbatch_command'])
            self.submittedJobs[job_id] = job

        # Now check the status of the jobs submitted to the batch system:
        # Subtract the list of running jobs from the list of all submitted jobs -- the result is a list of
        # jobs that have finished already
        finished_ids = list(jobIds_set - polled_ids)

        # Do not poll anything if currently there are no finished jobs
        if finished_ids:
            # Based on the job's exit code, check if the job has failed or completed successfully
            # However, the sacct/scontrol commands yield too much output if too many jobs have been submitted here
            # Therefore, we want to restrict the output by grepping specific job IDs
            # There's another problem with that: the length of a bash command is limited by the ARG_MAX kernel
            # variable, which is of order 2e6
            # This means that we have to split the job IDs into chunks, each of which we have to check separately
            finished_ids_chunks = [
                finished_ids[i:i + self.max_nof_greps]
                for i in range(0, len(finished_ids), self.max_nof_greps)
            ]
            for finished_ids_chunk in finished_ids_chunks:
                completion = self.check_job_completion(finished_ids_chunk)
                completed_jobs, running_jobs, failed_jobs = [], [], []
                for job_id, details in completion.iteritems():
                    if details.status == Status.completed:
                        completed_jobs.append(job_id)
                    elif details.status == Status.running:
                        running_jobs.append(job_id)
                    else:
                        failed_jobs.append(job_id)
                # If there are any failed jobs, throw
                if failed_jobs:
                    failed_jobs_str = ','.join(failed_jobs)
                    errors = [completion[job_id].status for job_id in failed_jobs]
                    logging.error("Job(s) w/ ID(s) {jobIds} finished with errors: {reasons}".format(
                        jobIds=failed_jobs_str,
                        reasons=', '.join(map(Status.toString, errors)),
                    ))

                    # Let's print a table where the first column corresponds to the job ID
                    # and the second column lists the exit code, the derived exit code, the status
                    # and the classification of the failed job
                    logging.error("Error table:")
                    for job_id in failed_jobs:
                        sys.stderr.write("{jobId} {exitCode} {derivedExitCode} {state} {status}\n".format(
                            jobId           = job_id,
                            exitCode        = completion[job_id].exit_code,
                            derivedExitCode = completion[job_id].derived_exit_code,
                            state           = completion[job_id].state,
                            status          = Status.toString(completion[job_id].status),
                        ))

                    sys.stderr.write('%s\n' % text_line)
                    for failed_job in failed_jobs:
                        for log in zip(['wrapper', 'executable'], ['log_wrap', 'log_exec']):
                            logfile = self.submittedJobs[failed_job][log[1]]
                            if os.path.isfile(logfile):
                                logfile_contents = open(logfile, 'r').read()
                            else:
                                logfile_contents = '<file is missing>'
                            sys.stderr.write('Job ID {id} {description} log ({path}):\n{line}\n{log}\n{line}\n'.format(
                                id          = failed_job,
                                description = log[0],
                                path        = logfile,
                                log         = logfile_contents,
                                line        = text_line,
                            ))

                        if self.submittedJobs[failed_job]['nof_submissions'] < self.max_resubmissions and \
                           completion[failed_job].status == Status.io_error:
                            # The job is eligible for resubmission if the job hasn't been resubmitted more
                            # than a preset limit of resubmissions AND if the job failed due to I/O errors
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED because: {reason} "
                                "-> resubmission attempt #{attempt}".format(
                                    id      = failed_job,
                                    args    = self.submittedJobs[failed_job]['args'],
                                    reason  = Status.toString(completion[failed_job].status),
                                    attempt = self.submittedJobs[failed_job]['nof_submissions'],
                                ))
                            self.submitJob(*self.submittedJobs[failed_job]['args'])
                            # The old ID must be deleted, b/c otherwise it would be used to compare against
                            # the squeue output and we would resubmit the failed job ad infinitum
                            del self.submittedJobs[failed_job]
                        else:
                            # We've exceeded the maximum number of resubmissions -> fail the workflow
                            raise Status.raiseError(completion[failed_job].status)
                else:
                    logging.debug("Job(s) w/ ID(s) {completedIds} finished successfully {runningInfo}".format(
                        completedIds=','.join(completed_jobs),
                        runningInfo='(%s still running)' % ','.join(running_jobs) if running_jobs else '',
                    ))

                # Mark successfully finished jobs as completed so that we won't request their status code again
                # Otherwise they would still be at the 'submitted' state
                for job_id in completed_jobs:
                    if not all(map(
                        lambda outputFile: is_file_ok(outputFile, validate_outputs=True, min_file_size=self.min_file_size),
                        self.submittedJobs[job_id]['outputFiles']
                    )):
                        if self.submittedJobs[job_id]['nof_submissions'] < self.max_resubmissions:
                            logging.warning(
                                "Job w/ ID {id} and arguments {args} FAILED to produce a valid output file "
                                "-> resubmission attempt #{attempt}".format(
                                    id      = job_id,
                                    args    = self.submittedJobs[job_id]['args'],
                                    attempt = self.submittedJobs[job_id]['nof_submissions'],
                                ))
                            self.submitJob(*self.submittedJobs[job_id]['args'])
                            del self.submittedJobs[job_id]
                        else:
                            raise ValueError(
                                "Job w/ ID {id} FAILED because it repeatedly produces a bogus output "
                                "file {output}, yet the job still exits w/o any errors".format(
                                    id     = job_id,
                                    output = ', '.join(self.submittedJobs[job_id]['outputFiles']),
                                ))
                    else:
                        # Job completed just fine
                        self.submittedJobs[job_id]['status'] = Status.completed

        jobIds_set = set([
            job_id for job_id in self.submittedJobs
            if self.submittedJobs[job_id]['status'] == Status.submitted
        ])
        nofJobs_left = len(jobIds_set) + len(self.queuedJobs)
        logging.info("Waiting for sbatch to finish (%d job(s) still left) ..." % nofJobs_left)
        if nofJobs_left > 0:
            if nonBlocking:
                return False
            two_pow_sixteen = 65536
            random.seed((abs(hash(uuid.uuid4()))) % two_pow_sixteen)
            max_delay = 300
            random_delay = random.randint(0, max_delay)
            logging.debug("sleeping for %i seconds." % (self.poll_interval + random_delay))
            time.sleep(self.poll_interval + random_delay)
        else:
            break

    return True

if grep_directory and not hdfs.isdir(grep_directory):
    logging.error("Grep directory '{grep_directory}' doesn't exist".format(
        grep_directory=grep_directory,
    ))
    sys.exit(1)

sample_keys = {}
for s_key, s_value in samples.iteritems():
    if sample_name_re.match(s_value['process_name_specific']):
        sample_keys[s_key] = s_value['process_name_specific']
if not sample_keys:
    logging.error("Invalid sample name: '{sample_name}'".format(sample_name=sample_name))
    sys.exit(1)
logging.debug("Got the following matches: {matches}".format(matches=', '.join(sample_keys.keys())))

# read the RLE numbers and form a dictionary { RLE number : root file that contains it }
rle_pattern = re.compile(r'\d+:\d+:\d+')
rles = {}
with open(rle_file, 'r') as f:
    for line in f:
        line = line.rstrip('\n')
        if not line:
            continue
        rle_match = rle_pattern.match(line)
        if not rle_match:
            logging.error("Line '{unmatched_line}' doesn't look like an RLE number".format(unmatched_line=line))
            sys.exit(1)

)
if not os.path.isdir(args.output_dir):
    if not args.force:
        raise ValueError('Use -f/--force to create output dir %s' % args.output_dir)
    else:
        os.makedirs(args.output_dir)  # os.path has no mkdirs(); os.makedirs() is the correct call

# Let's get the list of DY samples
dy_samples = {
    dbs_name: [] for dbs_name in samples if dbs_name.startswith('/DY')
}
for dy_sample in dy_samples:
    logging.debug('Found sample: {}'.format(dy_sample))

# Get files
for dy_sample in dy_samples:
    query = "dasgoclient -query='file dataset={} | grep file.name | grep file.nevents'".format(dy_sample)
    stdout, stderr = run_cmd(query, do_not_log=True, return_stderr=True)
    if not stdout or stderr:
        raise RuntimeError("Unsuccessful DBS query '%s': %s" % (query, stderr))
    files = map(lambda y: (y[0], int(y[1])),
                map(lambda x: x.split(), stdout.rstrip('\n').split('\n')))
    selected_files = []
    for file_cand in files:
        if args.min_event > 0 and file_cand[1] < args.min_event:

def dump_rle_parallel(output_dir, rle_branchNames, treeName, nof_files=100, force=False, test=False,
                      verbose=False, sample='', tmp_dir=''):
    '''Dumps RLE numbers in parallel

    Args:
        output_dir: string, Path to the directory where the RLE files will be stored
        rle_branchNames: dict { string : string }, Specifies the run, lumi and event branch names
        treeName: string, Name of the TTree
        nof_files: int, Number of files to be processed by one sbatch job
        force: bool, If True, creates `output_dir` if it's not there
        test: bool, If True, create the job scripts but do not submit them to SLURM
        verbose: bool, If True, prints lots of information to standard output
        sample: string, (optional) sample name; if the sample name is not specified,
                all samples will be processed
        tmp_dir: string, (optional) directory for the job scripts and logs
                 (default: `output_dir`/tmp)

    Returns:
        int array, List of sbatch job IDs that were submitted to SLURM
        This list can be used to check whether the jobs that were submitted in this routine
        have finished or not

    The method does the following things:
      1) loops over the sample entries in the 2016 dictionary (default) or selects only one
         sample (specified by `sample`)
      2) loops over all root files under the sample directory and arranges them into chunks
         specified by `nof_files`
      3) creates a Python script and a Bash script which loops over the entries in the file
      4) submits each job to SLURM, unless `test` is True
      5) returns a list of sbatch job IDs that were assigned to each job
    '''
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not os.path.isdir(output_dir):
        if not force:
            logging.error("Directory '{output_dir}' does not exist".format(output_dir=output_dir))
            sys.exit(1)
        else:
            logging.debug("Creating directory '{output_dir}' since it's missing".format(output_dir=output_dir))
            os.makedirs(output_dir)

    # let's make the temporary directories
    output_dir_tmp = os.path.join(output_dir, "tmp") if not tmp_dir else tmp_dir
    if not create_dir_if_not_exist(output_dir_tmp):
        sys.exit(1)
    output_dir_tmp_sh = os.path.join(output_dir_tmp, "sh")
    output_dir_tmp_py = os.path.join(output_dir_tmp, "py")
    output_dir_tmp_log = os.path.join(output_dir_tmp, "log")
    if not create_dir_if_not_exist(output_dir_tmp_sh):
        sys.exit(1)
    if not create_dir_if_not_exist(output_dir_tmp_py):
        sys.exit(1)
    if not create_dir_if_not_exist(output_dir_tmp_log):
        sys.exit(1)
    scratch_dir = "/scratch/{user_name}/dump_rle".format(user_name=getpass.getuser())

    idx = lambda x: int(x[x.rfind('_') + 1:x.rfind('.')])
    tree_pattern = re.compile(r"tree_\d+.root")

    jobId = 0
    root_files, remote_output, local_output = [], [], []
    found_sample_name = False
    sbatch_job_ids = []
    for s_key, s_value in samples.iteritems():
        sample_name = s_value['process_name_specific']
        if sample and sample_name != sample:
            continue
        found_sample_name = True

        sample_path = s_value['local_paths'][0]['path']
        logging.debug("Processing sample '{sample_name}'".format(sample_name=sample_name))

        output_dir_parent = os.path.join(output_dir, sample_name)
        if not os.path.isdir(output_dir_parent):
            os.makedirs(output_dir_parent)

        for sample_subdir_basename in os.listdir(sample_path):
            sample_subdir = os.path.join(sample_path, sample_subdir_basename)
            for rootfile_basename in os.listdir(sample_subdir):
                tree_match = tree_pattern.match(rootfile_basename)
                if not tree_match:
                    continue

                rootfile_idx = idx(rootfile_basename)
                root_files.append(os.path.join(sample_subdir, rootfile_basename))
                local_output.append(os.path.join(output_dir_parent, "{i}.txt".format(i=rootfile_idx)))
                remote_output.append(os.path.join(scratch_dir, str(jobId), sample_name,
                                                  os.path.basename(local_output[-1])))

                if len(root_files) == nof_files:
                    sh_path = os.path.join(output_dir_tmp_sh, "{i}.sh".format(i=jobId))
                    py_path = os.path.join(output_dir_tmp_py, "{i}.py".format(i=jobId))
                    log_path = os.path.join(output_dir_tmp_log, "{i}.log".format(i=jobId))
                    scratch_job_dir = os.path.join(scratch_dir, str(jobId))
                    sbatch_job_id = bake_job(
                        sh_path,
                        rle_branchNames,
                        treeName,
                        py_path,
                        scratch_job_dir,
                        zip(root_files, remote_output, local_output),
                        log_path,
                        not test,
                    )
                    if sbatch_job_id:
                        sbatch_job_ids.append(sbatch_job_id)
                    logging.debug("Creating job {jobId}".format(jobId=jobId))
                    root_files, remote_output, local_output = [], [], []
                    jobId += 1

    if sample and not found_sample_name:
        logging.error("Sample name '{sample_name}' does not exist in the sample dictionary".format(sample_name=sample))
        sys.exit(1)

    if root_files:
        sh_path = os.path.join(output_dir_tmp_sh, "{i}.sh".format(i=jobId))
        py_path = os.path.join(output_dir_tmp_py, "{i}.py".format(i=jobId))
        log_path = os.path.join(output_dir_tmp_log, "{i}.log".format(i=jobId))
        scratch_job_dir = os.path.join(scratch_dir, str(jobId))
        sbatch_job_id = bake_job(
            sh_path,
            rle_branchNames,
            treeName,
            py_path,
            scratch_job_dir,
            zip(root_files, remote_output, local_output),
            log_path,
            not test,
        )
        if sbatch_job_id:
            sbatch_job_ids.append(sbatch_job_id)
        logging.debug("Creating job {jobId}".format(jobId=jobId))

    logging.debug("Done!")
    return map(int, sbatch_job_ids)

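# A usage sketch, assuming the module-level `samples` dictionary has been
# loaded; the returned IDs can be polled until all sbatch jobs have finished:
#
#   job_ids = dump_rle_parallel(
#       '/home/user/rle_output',
#       {'run': 'run', 'lumi': 'luminosityBlock', 'event': 'event'},
#       'Events',
#   )
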
def validate(output_dir, verbose=False):
    '''Validates the job execution carried out by dump_rle_parallel()

    Args:
        output_dir: string, The directory where all RLE files are stored
        verbose: bool, Enable verbose output

    Returns:
        None

    The validation is quite basic: the program loops over the subdirectories of output_dir,
    matches them against the dictionary entries specified by the sample variable and counts
    the number of lines in each RLE file. If the number of lines doesn't match the number of
    entries in the corresponding ROOT file, the user is notified about such discrepancies.
    In principle, the script could also print the relevant commands to fix the issues
    (and dump them to an easily executable file), but let's leave that for another time.
    '''
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    root_file_regex = re.compile(r'^tree_(\d+).root$')
    file_dict = {k: [] for k in ['excess', 'missing', 'corrupted']}

    try:
        for s_key, s_value in samples.iteritems():
            sample_name = s_value['process_name_specific']
            sample_dir = os.path.join(output_dir, sample_name)
            if os.path.isdir(sample_dir):
                logging.debug("Found sample directory {sample_dir}".format(sample_dir=sample_dir))

                # NB! assume that there are no secondary paths in the dictionary (hence index 0!)
                sample_path_dict = s_value['local_paths'][0]
                sample_path = sample_path_dict['path']
                blacklist = sample_path_dict['blacklist']
                for sample_subdir in os.listdir(sample_path):
                    sample_subpath_idx = -1
                    try:
                        sample_subpath_idx = int(sample_subdir)
                    except ValueError:
                        continue
                    if sample_subpath_idx < 0:
                        raise ValueError("Internal error")
                    sample_subpath = os.path.join(sample_path, sample_subdir)
                    logging.debug("Processing sample subdirectory {sample_subpath}".format(
                        sample_subpath=sample_subpath))

                    for sample_file in os.listdir(sample_subpath):
                        sample_file_fullpath = os.path.join(sample_subpath, sample_file)
                        if not sample_file.endswith('.root') or not os.path.isfile(sample_file_fullpath):
                            continue

                        root_file_regex_match = root_file_regex.search(sample_file)
                        if not root_file_regex_match:
                            continue

                        root_file_idx = int(root_file_regex_match.group(1))
                        expected_rle_file_basename = '{root_file_idx}.txt'.format(root_file_idx=root_file_idx)
                        expected_rle_file = os.path.join(sample_dir, expected_rle_file_basename)
                        file_dict_entry = (expected_rle_file, sample_file_fullpath)
                        if root_file_idx in blacklist:
                            if os.path.isfile(expected_rle_file):
                                logging.warning(
                                    'Found RLE file {rle_file} (corresponding to blacklisted {root_file}) '
                                    'which you ought to delete'.format(
                                        rle_file=expected_rle_file,
                                        root_file=sample_file_fullpath,
                                    ))
                                file_dict['excess'].append(file_dict_entry)
                            continue

                        if not os.path.isfile(expected_rle_file):
                            logging.warning('Missing RLE file {rle_file} (corresponding to {root_file})'.format(
                                rle_file=expected_rle_file,
                                root_file=sample_file_fullpath,
                            ))
                            file_dict['missing'].append(file_dict_entry)
                            continue
                        nof_rle_events = raw_linecount(expected_rle_file)
                        if nof_rle_events == 1 and os.path.getsize(expected_rle_file) == 1:
                            # the RLE file contains only a newline, hence no events
                            nof_rle_events = 0

                        root_file = ROOT.TFile(sample_file_fullpath, 'read')
                        root_tree = root_file.Get('tree')
                        nof_entries = root_tree.GetEntries()

                        nof_events_diff = nof_rle_events - nof_entries
                        if nof_events_diff < 0:
                            logging.error(
                                'Missing {nof_events} events in {rle_filename} (corresponding to {sample_file}): '
                                'expected {expected}, got {actual}'.format(
                                    nof_events=abs(nof_events_diff),
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        elif nof_events_diff > 0:
                            logging.error(
                                'Got {nof_events} more events than expected in {rle_filename} (corresponding '
                                'to {sample_file}): expected {expected}, got {actual}'.format(
                                    nof_events=nof_events_diff,
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        else:
                            logging.debug('File {rle_filename} (corresponding to {sample_file}) looks OK'.format(
                                rle_filename=expected_rle_file,
                                sample_file=sample_file_fullpath,
                            ))
    except KeyboardInterrupt:
        pass

    if any(map(bool, file_dict.values())):
        logging.info('Validation finished with errors')
        for key in file_dict.keys():
            if file_dict[key]:
                logging.info('Number of {key} RLE files: {nof_key}'.format(key=key, nof_key=len(file_dict[key])))
                for entry in file_dict[key]:
                    logging.info('{rle_file} <=> {sample_file}'.format(rle_file=entry[0], sample_file=entry[1]))
    else:
        logging.info('Validation finished successfully')
    return

root_file_basename = 'tree_%d.root' % new_idx
root_file_idx = new_idx // 1000
dst_subdir = os.path.join(destination, '%04d' % root_file_idx)
if dst_subdir not in missing_subdirs:
    missing_subdirs.append(dst_subdir)
copy_relations[root_file] = os.path.join(dst_subdir, root_file_basename)

if args.copy:
    for missing_subdir in missing_subdirs:
        if not hdfs.isdir(missing_subdir):
            logging.info('Creating subdirectory {}'.format(missing_subdir))
            if hdfs.mkdirs(missing_subdir) != 0:
                raise RuntimeError("Unable to create directory: %s" % missing_subdir)

for src_file, dst_file in copy_relations.items():
    logging.debug('Copying file {} to {}'.format(src_file, dst_file))
    if args.copy:
        if hdfs.copy(src_file, dst_file, overwrite=False) != 0:
            raise RuntimeError("Unable to copy file from %s to %s" % (src_file, dst_file))
logging.info('Copying done')

new_lines[os.path.dirname(destination)] = (
    chunks[chunk_1] * len(file_list_1) / 100. +
    chunks[chunk_2] * len(file_list_2) / 100.
) / (len(file_list_1) + len(file_list_2)) * 100.

if args.modify:
    with open(args.input, 'w') as input_list_file:
        input_list_file.write('\n'.join(file_input) + '\n')
        input_list_file.write('\n'.join(map(lambda kv: '{} {:.2f}%'.format(*kv), new_lines.items())) + '\n')
    logging.info('Rewrote file {}'.format(args.input))

def memJobList(self, inputFileList, rle_whitelist):
    '''
    Args:
        inputFileList: { int : array of strings }; i.e. the fileset* ID and the list of files
                       * if the script were to generate configuration files, this number would
                         correspond to the job ID

    Returns:
        { int : { str : int, str : [str, str, ...], str : [int, int] } }
          |       |          |                      |
        job ID  "fileset_id" "input_fileset"      "event_range"

    The function reads a given set of files and determines the event range
    '''
    memJobDict = {}
    jobId = 0
    apply_rle_filter = bool(self.rle_filter_file)
    for filesetId, inputFileSet in inputFileList.iteritems():
        memJobDict_common = {'fileset_id': filesetId, 'input_fileset': inputFileSet}
        print("Processing file %s" % inputFileSet)
        ch = ROOT.TChain(self.treeName)
        for fn in inputFileSet:
            # chaining a file
            logging.debug("Processing file {fileName}".format(fileName=fn))
            ch.AddFile(fn)

        nof_entries = ch.GetEntries()
        memJobDict_common['nof_entries'] = nof_entries
        if nof_entries == 0:
            jobId += 1
            memJobDict[jobId] = dict({
                'event_range'     : [0, 0],
                'nof_int'         : 0,
                'nof_int_pass'    : 0,
                'nof_events_pass' : 0,
                'nof_zero'        : 0,
            }, **memJobDict_common)
            continue

        current_pos = 0
        evt_ranges = []

        counter, counter_arr = 0, []
        nof_events_pass_counter, nof_events_pass = 0, []
        nof_int_pass_counter, nof_int_pass = 0, []
        nof_zero_integrations, nof_events_zero = 0, []
        whitelist_all, whitelist_running = [], []

        run = array.array('I', [0])
        luminosityBlock = array.array('I', [0])
        event = array.array('L', [0])
        maxPermutations_addMEM = array.array('i', [0])

        ch.SetBranchAddress("run", run)
        ch.SetBranchAddress("luminosityBlock", luminosityBlock)
        ch.SetBranchAddress("event", event)
        if self.maxPermutations_branchName is not None and self.maxPermutations_branchName != "":
            ch.SetBranchAddress(self.maxPermutations_branchName, maxPermutations_addMEM)
        else:
            maxPermutations_addMEM[0] = 1

        for i in range(nof_entries):
            ch.GetEntry(i)
            if i > 0 and i % 10000 == 0:
                print(" Processing event %i/%i" % (i, nof_entries))
                logging.debug("Processing event %i/%i" % (i, nof_entries))

            rle = ':'.join(map(lambda nr: str(nr[0]), [run, luminosityBlock, event]))
            nof_integrations = maxPermutations_addMEM[0]
            if apply_rle_filter:
                if rle in rle_whitelist:
                    if not (nof_integrations > 0):
                        logging.error("Expected non-zero # integrations in event {}, but got {}".format(
                            rle, nof_integrations))
                        nof_integrations = 1
                else:
                    nof_integrations = 0

            if nof_integrations < 0:
                nof_integrations = 0

            if nof_integrations >= 1:
                nof_events_pass_counter += 1
                nof_int_pass_counter += nof_integrations
            else:
                nof_zero_integrations += 1

            if nof_integrations > self.mem_integrations_per_job:
                # report the offending RLE number via the branch buffers set above
                raise ValueError("Too many nof_integrations = %d in file(s) %s at %d:%d:%d" %
                                 (nof_integrations, ', '.join(inputFileSet), run[0], luminosityBlock[0], event[0]))

            if (counter + nof_integrations) > self.mem_integrations_per_job:
                if evt_ranges:
                    evt_ranges.append([evt_ranges[-1][1], current_pos])
                else:
                    evt_ranges.append([0, current_pos])
                counter_arr.append(counter)
                counter = 0
                nof_events_pass.append(nof_events_pass_counter)
                nof_events_pass_counter = 0
                nof_int_pass.append(nof_int_pass_counter)
                nof_int_pass_counter = 0
                nof_events_zero.append(nof_zero_integrations)
                nof_zero_integrations = 0
                if apply_rle_filter:
                    whitelist_all.append(whitelist_running)
                    whitelist_running = []

            if rle in rle_whitelist:
                whitelist_running.append(rle)

            counter += nof_integrations
            current_pos += 1

        if counter <= self.mem_integrations_per_job and counter >= 0:
            if evt_ranges:
                evt_ranges.append([evt_ranges[-1][1], int(nof_entries)])
            else:
                evt_ranges.append([0, int(nof_entries)])
            counter_arr.append(counter)
            nof_events_pass.append(nof_events_pass_counter)
            nof_int_pass.append(nof_int_pass_counter)
            nof_events_zero.append(nof_zero_integrations)
            if apply_rle_filter:
                whitelist_all.append(whitelist_running)

        # ensure that the event ranges won't overlap (i.e. there won't be any double-processing of any event)
        evt_ranges_cat = []
        for v in [range(x[0], x[1]) for x in evt_ranges]:
            evt_ranges_cat += v
        assert(evt_ranges_cat == range(nof_entries))
        assert(bool(evt_ranges))

        for i in range(len(evt_ranges)):
            if self.max_jobs_per_sample == -1 or jobId < self.max_jobs_per_sample:
                jobId += 1
                memJobDict[jobId] = dict({
                    'event_range'     : evt_ranges[i],
                    'nof_int'         : counter_arr[i],
                    'nof_int_pass'    : nof_int_pass[i],
                    'nof_events_pass' : nof_events_pass[i],
                    'nof_zero'        : nof_events_zero[i],
                    'whitelist'       : whitelist_all[i] if apply_rle_filter else [],
                }, **memJobDict_common)
        # we now have all event ranges per one file, let's add them to the dictionary
        del ch
    return memJobDict

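# The returned structure, with illustrative (not real) values:
#
#   { 1 : { 'fileset_id'      : 0,
#           'input_fileset'   : ['tree_1.root'],
#           'nof_entries'     : 1000,
#           'event_range'     : [0, 517],
#           'nof_int'         : 2500,
#           'nof_int_pass'    : 2400,
#           'nof_events_pass' : 510,
#           'nof_zero'        : 7,
#           'whitelist'       : [] },
#     ... }
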
    sys_option = 'central'
elif 'CMS' in rle_file:
    sys_option = rle_file[rle_file.find('CMS') : rle_file.find(rle_file.split('_')[-1]) - 1]
else:
    raise RuntimeError('Unrecognizable file: %s' % rle_file_path)
assert(sys_option)

rle_arr = set()
with open(rle_file_path, 'r') as rle_file_ptr:
    for line in rle_file_ptr:
        rle_arr.add(line.rstrip('\n'))
rles[channel][region_name][sample_name][sys_option] = copy.deepcopy(rle_arr)

sys_options = list(sorted(rles[channel][region_name][sample_name].keys()))
if 'central' in sys_options:
    sys_options.remove('central')
    sys_options = ['central'] + sys_options

header = [''] + sys_options
rows = [header]
for sys_option_outer in sys_options:
    row = []
    row.append(sys_option_outer)

# compare every pair of systematic options by their RLE overlap
# (kept as a separate block so the loop variable doesn't shadow the row loop above)
for sys_option_outer in rles[channel][region_name][sample_name]:
    for sys_option_inner in rles[channel][region_name][sample_name]:
        outer_set = rles[channel][region_name][sample_name][sys_option_outer]
        inner_set = rles[channel][region_name][sample_name][sys_option_inner]
        logging.debug('{} vs {}: {} common, {} ({}) exclusively in {} ({})'.format(
            sys_option_outer, sys_option_inner,
            len(outer_set & inner_set),
            len(outer_set - inner_set), len(inner_set - outer_set),
            sys_option_outer, sys_option_inner
        ))

def plot(input_files, output_files, title, expected_neff, mode):
    histogram_dict = {}
    for sample_name, sample_entry in input_files.items():
        if not hdfs.isfile(sample_entry['input']):
            logging.error('Could not find file {}'.format(sample_entry['input']))
            continue
        root_file = ROOT.TFile.Open(sample_entry['input'], 'read')
        logging.debug('Opened file {}'.format(sample_entry['input']))
        root_directories = list(filter(
            lambda root_dir: root_dir != None, [
                root_file.Get(os.path.join(key.GetName(), mode, 'genEvt')) \
                for key in root_file.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
            ]
        ))
        if len(root_directories) != 1:
            raise RuntimeError('Expected single directory in %s' % sample_entry['input'])
        root_dir = root_directories[0]
        histogram_dirs = [
            root_dir.Get(key.GetName()) \
            for key in root_dir.GetListOfKeys() if key.GetClassName() == 'TDirectoryFile'
        ]
        if len(histogram_dirs) != 1:
            raise RuntimeError('Expected single directory containing lumiScale histograms in %s' %
                               sample_entry['input'])
        histogram_dir = histogram_dirs[0]
        histograms = [
            key.GetName() for key in histogram_dir.GetListOfKeys() \
            if key.GetClassName().startswith('TH1') and 'lumiScale' in key.GetName()
        ]
        for histogram_name_actual in histograms:
            histogram_name = histogram_name_actual.replace('_lumiScale', '').replace('CMS_ttHl_', '') \
                             if histogram_name_actual != 'lumiScale' else 'central'
            histogram = histogram_dir.Get(histogram_name_actual).Clone()
            histogram.SetDirectory(0)
            if histogram.GetEntries() != sample_entry['nentries'] and mode == 'unbiased':
                raise RuntimeError('Expected {} entries from {} in file {}, but got {} entries'.format(
                    sample_entry['nentries'], histogram_name, sample_entry['input'], histogram.GetEntries(),
                ))
            if histogram_name not in histogram_dict:
                histogram_dict[histogram_name] = {
                    'histogram' : histogram,
                    'nentries'  : histogram.GetEntries(),
                    'nfiles'    : 1,
                }
            else:
                histogram_dict[histogram_name]['histogram'].Add(histogram)
                histogram_dict[histogram_name]['nentries'] += histogram.GetEntries()
                histogram_dict[histogram_name]['nfiles'] += 1
        root_file.Close()

    if not histogram_dict:
        logging.error('Could not find histograms for samples {}'.format(', '.join(list(input_files.keys()))))
        return

    if len(set(histogram_dict[histogram_name]['nfiles'] for histogram_name in histogram_dict)) != 1:
        raise RuntimeError('Inconsistent number of files found for samples %s' % ', '.join(list(input_files.keys())))
    if len(set(histogram_dict[histogram_name]['nentries'] for histogram_name in histogram_dict)) != 1:
        raise RuntimeError('Inconsistent number of entries found in samples %s' % ', '.join(list(input_files.keys())))

    min_y = -1
    max_y = -1
    nentries = -1
    for histograms in histogram_dict.values():
        histogram = histograms['histogram']
        y_content = histogram.GetBinContent(1)
        y_error = histogram.GetBinError(1)

        y_down = y_content - y_error
        y_up = y_content + y_error

        if min_y < 0:
            min_y = y_down
        if max_y < 0:
            max_y = y_up
        if y_down < min_y:
            min_y = y_down
        if y_up > max_y:
            max_y = y_up

        if nentries < 0:
            nentries = histograms['nentries']
        else:
            assert(nentries == histograms['nentries'])

        if not (y_down < expected_neff < y_up) and mode == 'unbiased':
            logging.warning("Effective event count {} not within {} +- {}".format(expected_neff, y_content, y_error))

    if mode == 'unbiased':
        min_y = min(min_y, expected_neff)
        max_y = max(max_y, expected_neff)
    diff = 0.2 * (max_y - min_y)
    min_y -= diff
    max_y += diff

    canvas = ROOT.TCanvas('c', 'c', 1200, 900)
    canvas.SetGrid()
    ROOT.gStyle.SetOptStat(0)

    legend = ROOT.TLegend(0.1, 0.7, 0.48, 0.9)
    legend.SetHeader('N_{eff} (%d entries)' % nentries)

    expected_histogram = None

    line_width = 3
    marker_style = 20
    fill_style = 4000

    lines = []

    for idx, histogram_name in enumerate(sorted(histogram_dict.keys())):
        histogram = histogram_dict[histogram_name]['histogram']
        color = 2 + idx

        histogram.SetTitle(title)
        histogram.SetAxisRange(min_y, max_y, "Y")
        histogram.SetLineColor(color)
        histogram.SetMarkerColor(color)
        histogram.SetLineWidth(line_width)
        histogram.SetMarkerStyle(marker_style)
        histogram.SetFillStyle(fill_style)
        histogram.Draw("l e1%s" % (" same" if idx > 0 else ""))

        y_content = histogram.GetBinContent(1)
        y_error = histogram.GetBinError(1)
        y_up = y_content + y_error
        y_down = y_content - y_error

        bin_width = histogram.GetBinWidth(1)
        bin_center = histogram.GetBinCenter(1)
        line_min_x = bin_center - bin_width / 4
        line_max_x = bin_center + bin_width / 4

        line_down = ROOT.TLine(line_min_x, y_down, line_max_x, y_down)
        line_down.SetLineColor(color)
        line_down.SetLineWidth(line_width)
        line_down.Draw()
        lines.append(line_down)

        line_up = ROOT.TLine(line_min_x, y_up, line_max_x, y_up)
        line_up.SetLineColor(color)
        line_up.SetLineWidth(line_width)
        line_up.Draw()
        lines.append(line_up)

        sig_digits = max(8 - int(math.ceil(math.log10(y_content))), 1) if y_content > 0. else 1
        leg_pattern = '%s (%.{}f #pm %.{}f)'.format(sig_digits, sig_digits)
        leg_name = leg_pattern % (histogram_name, y_content, y_error)
        legend.AddEntry(histogram, leg_name)

        logging.debug('Effective event count for the sys unc option {} is {} +- {}'.format(
            histogram_name, y_content, y_error
        ))

        if not expected_histogram and mode == 'unbiased':
            expected_histogram = histogram.Clone()
            expected_histogram.Reset()
            expected_histogram.SetBinContent(1, expected_neff)
            expected_histogram.SetBinError(1, 0)
            expected_histogram.SetLineColor(ROOT.kBlack)
            expected_histogram.SetMarkerColor(ROOT.kBlack)
            expected_histogram.SetLineWidth(line_width)
            expected_histogram.SetMarkerStyle(marker_style)
            expected_histogram.SetLineStyle(9)
            expected_histogram.SetFillStyle(fill_style)

    if expected_histogram:
        logging.debug('Expecting {} events'.format(expected_neff))
        expected_histogram.Draw("e2 same")
        legend.AddEntry(expected_histogram, 'expected (%.1f)' % expected_neff)

    legend.Draw()

    for output_file in output_files:
        canvas.SaveAs(output_file)

    canvas.Close()
    legend.Delete()
    if expected_histogram:
        expected_histogram.Delete()
    for histogram_name in histogram_dict:
        histogram_dict[histogram_name]['histogram'].Delete()
    for line in lines:
        line.Delete()

logging.warning("Creating directory: {}".format(output_dir)) os.makedirs(output_dir) input_file = ROOT.TFile.Open(input_fn, 'read') assert (input_file) input_tree = input_file.Get('Events') denominator_process = Hist2D(BINNING_MHH[binning_choice], BINNING_COSTHETASTAR, name=process_name) denominator_category = Hist2D(BINNING_MHH[binning_choice], BINNING_COSTHETASTAR, name=category_name) nof_events = input_tree.GetEntries() logging.debug("Input file {} has {} events".format(input_fn, nof_events)) has_evt_brs = not bool({MHH_BR_NAME, COST_BR_NAME} - set(br.GetName() for br in input_tree.GetListOfBranches())) if has_evt_brs: logging.debug( "Input file {} already contains necessary event-level branches".format( input_fn)) mhh_br = array.array('f', [0.]) cost_br = array.array('f', [0.]) genweight_br = array.array('f', [0.]) input_tree.SetBranchAddress(MHH_BR_NAME, mhh_br) input_tree.SetBranchAddress(COST_BR_NAME, cost_br)
def __init__(self, nn="hdfs-nn", port=9000): '''Set up runtime parameters for performing direct queries to HDFS :param nn: Name of the name node :param port: HDFS service port The constructor does the following three things: - set up necessary environment variables in order to perform queries to HDFS - dynamically load the library calls from libhdfs.so and define the return types for each call - build the connection to Hadoop service and local file system ''' self.nn = nn self.port = port logging.debug("Setting environment variables") hadoop_prefix = '/usr/lib/hadoop' library_path = os.path.join(hadoop_prefix, 'lib/native') log_dir = os.path.join(hadoop_prefix, 'logs') log_file = 'hadoop.log' home_dir = hadoop_prefix policy_file = 'hadoop-policy.xml' id_str = '' log_level = 'INFO' preferIPv4Stack = True heapsize = 20480 memsize = 2048 cachesize = 512 malloc_arena_max = 4 # source /usr/lib/hadoop/libexec/hadoop-config.sh && unset HADOOP_HDFS_HOME # Unfortunately, it not possible to completely suppress JVM stack trace in case the call to a Java library throws # an exception. Tried both '-XX:-StackTraceInThrowable' and '-XX:MaxJavaStackTraceDepth=0' in both 'HADOOP_OPTS' # and 'JAVA_TOOL_OPTIONS' environment variables but to no avail. os.environ['JAVA_HOME'] = '' os.environ['LIBHDFS_OPTS'] = '-Xmx{}m'.format(heapsize) os.environ['HADOOP_CONF_DIR'] = '/etc/hadoop/conf' os.environ['MALLOC_ARENA_MAX'] = str(malloc_arena_max) os.environ['HADOOP_HEAPSIZE'] = str(heapsize) os.environ['HADOOP_PREFIX'] = '/usr/lib/hadoop' os.environ['HADOOP_OPTS'] = '-Xms{}m '.format(memsize) + \ '-XX:ReservedCodeCacheSize={}m '.format(cachesize) + \ '-Dhadoop.log.dir={} '.format(log_dir) + \ '-Dhadoop.log.file={} '.format(log_file) + \ '-Dhadoop.home.dir={} '.format(home_dir) + \ '-Dhadoop.id.str={} '.format(id_str) + \ '-Dhadoop.root.logger={},console '.format(log_level) + \ '-Dhadoop.policy.file={} '.format(policy_file) + \ '-Djava.library.path={} '.format(library_path) + \ '-Djava.net.preferIPv4Stack={}'.format('true' if preferIPv4Stack else 'false') if not os.environ['LD_LIBRARY_PATH'].endswith(':'): os.environ['LD_LIBRARY_PATH'] += ':' os.environ['LD_LIBRARY_PATH'] += library_path # hadoop classpath --glob classpath = [ "/etc/hadoop/conf", "/usr/lib/hadoop", "/usr/lib/hadoop/lib", "/usr/lib/hadoop-hdfs", "/usr/lib/hadoop-hdfs/lib", ] client_library = "/usr/lib/hadoop/client" classpath.extend( map( lambda jarfile: os.path.join(client_library, jarfile), filter(lambda filename: filename.endswith('.jar'), os.listdir(client_library)))) os.environ['CLASSPATH'] = ':'.join(classpath) lib_path = "/usr/lib64/libhdfs.so" if not os.path.isfile(lib_path): raise hdfsException("No such file: %s" % lib_path) logging.debug("Loading {lib}".format(lib=lib_path)) self.lib = ctypes.cdll.LoadLibrary(lib_path) self.lib.hdfsListDirectory.restype = ctypes.POINTER(_hdfs.hdfsFileInfo) self.lib.hdfsGetPathInfo.restype = ctypes.POINTER(_hdfs.hdfsFileInfo) self.lib.hdfsExists.restype = ctypes.c_int32 self.lib.hdfsDelete.restype = ctypes.c_int32 self.lib.hdfsCreateDirectory.restype = ctypes.c_int32 self.lib.hdfsChown.restype = ctypes.c_int32 self.lib.hdfsChmod.restype = ctypes.c_int32 self.lib.hdfsMove.restype = ctypes.c_int32 self.lib.hdfsCopy.restype = ctypes.c_int32 self.hdfsFileInfo_size = ctypes.sizeof(_hdfs.hdfsFileInfo) logging.debug("Building HDFS interface") self.bld = self.lib.hdfsNewBuilder() if not self.bld: raise hdfsException("Could not create new HDFS interface") self.lib.hdfsBuilderSetNameNode(self.bld, self.nn) 
self.lib.hdfsBuilderSetNameNodePort(self.bld, self.port) self.lbld = self.lib.hdfsNewBuilder() self.lib.hdfsBuilderSetNameNode(self.lbld, None) self.lib.hdfsBuilderSetNameNodePort(self.lbld, 0) logging.debug("Connecting to the HDFS interface") self.fs = self.lib.hdfsBuilderConnect(self.bld) if not self.fs: raise hdfsException( "Could not connect to the HDFS interface (nn = '%s', port = %d)" % (self.nn, self.port)) logging.debug("Interfacing to the local file system") self.lfs = self.lib.hdfsBuilderConnect(self.lbld) if not self.lfs: raise hdfsException( "Could not create interface to local file system")
def run_brilcalc(hlt_paths_in, json, normtag, units, brilcalc_path, data_file, output_dir):
    assert(all(map(lambda hlt_path: hlt_path.startswith('HLT'), hlt_paths_in)))
    hlt_paths = {hlt_path: hlt_version(hlt_path) for hlt_path in hlt_paths_in}

    for input_file in (json, normtag):
        if input_file and not os.path.isfile(input_file):
            raise ValueError("No such file: %s" % input_file)
    if not os.path.isfile(brilcalc_path):
        raise ValueError("No such file: %s" % brilcalc_path)

    if data_file:
        data = parse_data(data_file)
        if data['normtag'] != os.path.basename(normtag):
            logging.warning("File {} is generated with normtag '{}' but requested using normtag '{}'".format(
                data_file, data['normtag'], normtag))
        if data['json'] != os.path.basename(json):
            logging.warning("File {} is generated with JSON '{}' but requested using JSON '{}'".format(
                data_file, data['json'], json))
    else:
        data = None

    if output_dir and not os.path.isdir(output_dir):
        os.makedirs(output_dir)

    # prepare the jobs
    pool_size = 16
    pool = multiprocessing.Pool(pool_size, handle_worker)
    logging.debug("Constructing pool for {} HLT paths".format(len(hlt_paths)))
    for hlt_path in hlt_paths:
        pool.apply_async(
            process_hlt,
            args=(hlt_paths[hlt_path], json, brilcalc_path, normtag, units, output_dir),
            callback=get_trigger_results,
        )
    pool.close()
    pool.join()
    logging.debug("Pool finished")

    # parse trigger_results
    for hlt_path in hlt_paths:
        dict_entry = trigger_results[hlt_paths[hlt_path]]
        if data_file:
            present_eras = []
            for run in dict_entry['runs']:
                for era in data['runs']:
                    if data['runs'][era]['run_start'] <= run <= data['runs'][era]['run_end'] and \
                       era not in present_eras:
                        present_eras.append(era)
            all_eras = [era for era in data['runs']]
            missing_eras = list(sorted(list(set(all_eras) - set(present_eras))))

            expected_recording = data['totrecorded']
            expected_delivery = data['totdelivered']
            data_units = data['units']
            unit_factor = 1000**(LUMI_UNITS.index(units) - LUMI_UNITS.index(data_units))
            expected_recording *= unit_factor
            expected_delivery *= unit_factor

            prescale_recording = (expected_recording / dict_entry['recorded']) if dict_entry['recorded'] != 0. else -1.
            prescale_delivery = (expected_delivery / dict_entry['delivered']) if dict_entry['delivered'] != 0. else -1.
            if int(prescale_recording) == 1:
                prescale_msg = "NOT prescaled"
            elif int(prescale_recording) == -1:
                prescale_msg = "NOT recording anything?"
            else:
                prescale_msg = "prescale factor %.1f (%.1f from delivery)" % (prescale_recording, prescale_delivery)
            prescale_msg += " (expected %.1f recorded, %.1f delivery; units = %s)" % (
                expected_recording, expected_delivery, units)

        print("{} nrun = {} totdelivered = {} totrecorded = {} (units = {})".format(
            hlt_path,
            len(dict_entry['runs']),
            dict_entry['delivered'],
            dict_entry['recorded'],
            units,
        ))
        if data_file:
            print("{} present in eras: {} (missing in {} eras) => {}".format(
                hlt_path,
                ", ".join(present_eras),
                ", ".join(missing_eras) if missing_eras else "none of the",
                prescale_msg,
            ))
        for hlt_dict in dict_entry['paths']:
            print("\t{} nfill = {} nrun = {} ncms = {} totdelivered = {} totrecorded = {}".format(
                hlt_dict['hltpath'],
                hlt_dict['nfill'],
                hlt_dict['nrun'],
                hlt_dict['ncms'],
                hlt_dict['totdelivered'],
                hlt_dict['totrecorded'],
            ))

for dbs_name, sample_info in samples.items():
    if dbs_name == 'sum_events':
        continue
    category = sample_info['sample_category']
    if not is_nonresonant(category):
        continue
    if category not in weight_sums:
        weight_sums[category] = {scan_idx: [] for scan_idx in range(nof_weights)}

    root_files = glob.glob(sample_info["local_paths"][0]['path'] + "/*/*.root")
    logging.debug('Found {} files in sample {} (category {})'.format(
        len(root_files),
        sample_info['process_name_specific'],
        category,
    ))

    denom_title = category
    logging.debug('Loading denominator histogram {} from file {}'.format(denom_title, denom_file))
    sumEvt = fileHH.Get(denom_title)
    assert(sumEvt)

    for root_file in root_files:
        logging.debug('Processing file: {}'.format(root_file))
        tfile = ROOT.TFile.Open(root_file, 'read')
        assert(tfile)
        tree = tfile.Get(inputTree)
        assert(tree)

def process_hlt(hlt_path, golden_json, brilcalc_path, normtag, units, output_dir):
    brilcalc_cmd = '{brilcalc_path} lumi -c web {json} {normtag} --output-style csv --hltpath "{hlt_path}" -u {units}'.format(
        brilcalc_path=brilcalc_path,
        json=('-i {}'.format(golden_json) if golden_json else ''),
        normtag=('--normtag {}'.format(normtag) if normtag else ''),
        hlt_path=hlt_path,
        units=units,
    )
    output_file = os.path.join(output_dir, '{}.csv'.format(hlt_path.replace('_v*', ''))) if output_dir else ''

    logging.debug("Running: {}".format(brilcalc_cmd))
    brilcalc_run = subprocess.Popen(brilcalc_cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE)
    brilcalc_out, brilcalc_err = brilcalc_run.communicate()
    if output_file:
        with open(output_file, 'w') as output_file_ptr:
            output_file_ptr.write(brilcalc_out + '\n')
    if brilcalc_err:
        raise ValueError("brilcalc returned an error: %s" % brilcalc_err)

    logging.debug("Parsing results for {}".format(hlt_path))
    brilcalc_out_split = brilcalc_out.rstrip('\n').split('\n')
    brilcalc_out_split = list(map(lambda line: line.rstrip('\r'), brilcalc_out_split))

    read_runs = 0  # 0 -> not yet reading runs; 1 -> reading the run table; -1 -> past the summary
    delivered, recorded = 0., 0.
    runs = []
    hlt_paths = []

    for line in brilcalc_out_split:
        if line.startswith("#run:fill,time,ncms,hltpath"):
            if read_runs == 0:
                read_runs = 1
        elif line.startswith("#Summary:"):
            read_runs = -1
        elif line.startswith("#HLT_"):
            line_split = line.replace('#', '').split(',')
            hlt_paths.append({
                'hltpath'      : line_split[0],
                'nfill'        : int(line_split[1]),
                'nrun'         : int(line_split[2]),
                'ncms'         : int(line_split[3]),
                'totdelivered' : float(line_split[4]),
                'totrecorded'  : float(line_split[5]),
            })
        elif line.startswith("#Sum delivered"):
            delivered = float(line.split()[-1])
        elif line.startswith("#Sum recorded"):
            recorded = float(line.split()[-1])
        else:
            if read_runs > 0:
                runs.append(int(line.split(':')[0]))
            else:
                pass

    dict_entry = {
        'runs'      : runs,
        'delivered' : delivered,
        'recorded'  : recorded,
        'paths'     : hlt_paths,
    }
    return hlt_path, dict_entry
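
# The returned tuple, with illustrative (not real) values:
#
#   ('HLT_IsoMu24_v*', {'runs': [273158, ...],
#                       'delivered': 35.9, 'recorded': 33.5,
#                       'paths': [{'hltpath': 'HLT_IsoMu24_v2', 'nfill': 10, ...}]})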