def submit_resolution_dag(pairs_file, max_l1_pt, log_dir, append,
                          pu_bins, eta_bins, common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG for one pairs file.

    This will run makeResolutionPlots over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    max_l1_pt : int
        Maximum L1 pt to consider when making plots.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU bin).
    pu_bins : list[list[int, int]]
        List of PU bin edges.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Common input files to be transferred to every job.
    force_submit : bool, optional
        If True, forces job submission even if the proposed output files
        already exist. Otherwise, the program quits before submission.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # Loop over PU bins
    # -------------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)

        # Hold all output filenames
        res_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--excl',
                        # '--maxPt', max_l1_pt,
                        # '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]

            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])

            res_jobs.add_job(res_job)

        # Add inclusive bins (central, forward, all)
        # remove the [0:1] to do all - currently central only as HF is broken
        for incl in ['central', 'forward', 'all'][0:1]:
            out_file = out_stem + "_%s" % incl + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)

            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--incl']
            # Optional extra cuts (currently disabled):
            # '--maxPt', max_l1_pt, '--PUmin', pu_min, '--PUmax', pu_max
            if incl != 'all':
                job_args.append('--%s' % incl)

            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])

            res_jobs.add_job(res_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir,
                                  out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + res_output_files

        hadder = ht.Job(name='haddRes',
                        args=hadd_args,
                        input_files=res_output_files,
                        output_files=hadd_output)

        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        for job in res_jobs:
            res_dag.add_job(job)

        res_dag.add_job(hadder, requires=[j for j in res_jobs])

        # Check if any of the output files already exist - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + res_output_files:
                if os.path.isfile(f):
                    print 'ERROR: output file already exists - not submitting'
                    print 'FILE:', f
                    return 1

        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
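
# Hedged usage sketch (not part of the original module): shows how
# submit_resolution_dag might be driven from a top-level script. Every path,
# bin edge and the `append` template below is an illustrative placeholder,
# not a value taken from this repository. Note that `append` is later passed
# through str.format() with the keys puMin, puMax and maxL1Pt, so any braces
# it contains must use only those names.
def _example_submit_resolution():
    # Placeholder eta bin edges; exclusive bins are built pairwise from these
    eta_bins = [0.0, 0.348, 0.695, 1.044, 1.392, 1.74, 2.172, 3.0, 3.5, 4.0, 4.5, 5.0]
    # Placeholder PU bins: each entry is [PUmin, PUmax]
    pu_bins = [[0, 10], [15, 25], [30, 40]]
    submit_resolution_dag(pairs_file='/hdfs/user/DATASET/pairs/pairs_MC_ak4.root',
                          max_l1_pt=1022,
                          log_dir='/storage/user/jobs/logs',
                          append='_maxPt{maxL1Pt}_PU{puMin}to{puMax}',
                          pu_bins=pu_bins,
                          eta_bins=eta_bins,
                          common_input_files=['makeResolutionPlots.py'],
                          force_submit=False)
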
def submit_matcher_dag(exe, ntuple_dir, log_dir, l1_dir, ref_dir, deltaR,
                       ref_min_pt, cleaning_cut, append, force_submit):
    """Submit one matcher DAG for one directory of ntuples.

    This will run `exe` over all Ntuple files and then hadd the results
    together.

    Parameters
    ----------
    exe : str
        Name of executable.
    ntuple_dir : str
        Name of directory with L1Ntuples to run over.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    l1_dir : str
        Name of TDirectory in Ntuple that holds L1 jets.
    ref_dir : str
        Name of TDirectory in Ntuple that holds reference jets.
    deltaR : float
        Maximum deltaR(L1, Ref) for a match.
    ref_min_pt : float
        Minimum pT cut on reference jets to be considered for matching.
    cleaning_cut : str or None
        Jet cleaning cut to apply (passed via --cleanJets), if set.
    append : str
        String to append to filenames to track various settings (e.g. deltaR cut).
    force_submit : bool
        If True, forces job submission even if the proposed output files
        already exist. Otherwise, the program quits before submission.
    """
    # DAG for jobs
    stem = 'matcher_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
    matcher_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                            status_file=os.path.join(log_dir, '%s.status' % stem))

    # JobSet for each matching job
    log_stem = 'matcher.$(cluster).$(process)'

    matcher_jobs = ht.JobSet(exe=find_executable(exe),
                             copy_exe=True,
                             filename='submit_matcher.condor',
                             setup_script=None,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             share_exe_setup=True,
                             hdfs_store=ntuple_dir)

    # For creating filenames later
    fmt_dict = dict()

    # Hold all output filenames
    match_output_files = []

    # Additional files to copy across - JEC, etc
    common_input_files = []

    # Add matcher job for each ntuple file
    for ind, ntuple in enumerate(os.listdir(ntuple_dir)):
        # if ind > 10:
        #     break

        # Skip non-ntuple files
        if not ntuple.endswith('.root') or ntuple.startswith('pairs'):
            continue

        ntuple_abspath = os.path.join(ntuple_dir, ntuple)

        # Construct output name
        ntuple_name = os.path.splitext(ntuple)[0]
        # handle anything up to first underscore (L1Tree, L1Ntuple, ...)
        result = re.match(r'^[a-zA-Z0-9]*_', ntuple_name)
        if result:
            pairs_file = '%s_%s.root' % (ntuple_name.replace(result.group(), 'pairs_'),
                                         append.format(**fmt_dict))
        else:
            pairs_file = 'pairs_%s_%s.root' % (ntuple_name,
                                               append.format(**fmt_dict))
        out_file = os.path.join(ntuple_dir, pairs_file)
        match_output_files.append(out_file)

        # Add matching job
        job_args = ['-I', ntuple_abspath, '-O', out_file,
                    '--refDir', ref_dir, '--l1Dir', l1_dir,
                    '--draw 0', '--deltaR', deltaR, '--refMinPt', ref_min_pt]
        if cleaning_cut:
            job_args.extend(['--cleanJets', cleaning_cut])

        input_files = common_input_files + [ntuple_abspath]

        match_job = ht.Job(name='match_%d' % ind,
                           args=job_args,
                           input_files=input_files,
                           output_files=[out_file])

        matcher_jobs.add_job(match_job)
        matcher_dag.add_job(match_job)

    # Construct final filename
    # -------------------------------------------------------------------------
    final_file = 'pairs_%s_%s.root' % (os.path.basename(ntuple_dir.rstrip('/')),
                                       append.format(**fmt_dict))
    final_dir = os.path.join(os.path.dirname(ntuple_dir.rstrip('/')), 'pairs')
    cc.check_create_dir(final_dir, info=True)
    final_file = os.path.join(final_dir, final_file)
    log.info("Final file: %s", final_file)

    # Check if any of the output files already exist - maybe we mucked up?
    # -------------------------------------------------------------------------
    if not force_submit:
        for f in [final_file] + match_output_files:
            if os.path.isfile(f):
                raise RuntimeError('ERROR: output file already exists - not submitting.'
                                   '\nTo bypass, use -f flag.\nFILE: %s' % f)

    # Add in hadding jobs
    # -------------------------------------------------------------------------
    hadd_jobs = add_hadd_jobs(matcher_dag, matcher_jobs.jobs.values(),
                              final_file, log_dir)

    # Add in job to delete individual and intermediate hadd files
    # -------------------------------------------------------------------------
    log_stem = 'matcherRm.$(cluster).$(process)'
    rm_jobs = ht.JobSet(exe='hadoop',
                        copy_exe=False,
                        filename='submit_matcherRm.condor',
                        out_dir=log_dir, out_file=log_stem + '.out',
                        err_dir=log_dir, err_file=log_stem + '.err',
                        log_dir=log_dir, log_file=log_stem + '.log',
                        cpus=1, memory='100MB', disk='10MB',
                        transfer_hdfs_input=False,
                        share_exe_setup=False,
                        hdfs_store=ntuple_dir)
    for i, job in enumerate(chain(matcher_jobs, hadd_jobs[:-1])):
        pairs_file = job.output_files[0]
        rm_job = ht.Job(name='rm%d' % i,
                        args=' fs -rm -skipTrash %s' % pairs_file.replace('/hdfs', ''))
        rm_jobs.add_job(rm_job)
        matcher_dag.add_job(rm_job, requires=hadd_jobs[-1])

    # Submit
    # -------------------------------------------------------------------------
    # matcher_dag.write()
    matcher_dag.submit()
    return matcher_dag.status_file
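
# Hedged usage sketch (not part of the original module): one way a caller
# might invoke submit_matcher_dag. The executable name, directories, tree
# paths and cut values are illustrative placeholders. Since fmt_dict is empty
# inside the function, `append` must not contain any str.format() fields.
def _example_submit_matcher():
    status_file = submit_matcher_dag(exe='RunMatcher',  # hypothetical matcher executable
                                     ntuple_dir='/hdfs/user/DATASET/ntuples',
                                     log_dir='/storage/user/jobs/logs',
                                     l1_dir='l1UpgradeEmuTree/L1UpgradeTree',  # placeholder TDirectory
                                     ref_dir='l1GeneratorTree/L1GenTree',  # placeholder TDirectory
                                     deltaR=0.4,
                                     ref_min_pt=10.,
                                     cleaning_cut=None,
                                     append='ref10to5000_dr0p4',
                                     force_submit=False)
    print 'Monitor with: DAGstatus.py', status_file
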
def submit_runCalib_dag(pairs_file, log_dir, append, pu_bins, eta_bins,
                        common_input_files, force_submit=False):
    """Submit one runCalibration DAG for one pairs file.

    This will run runCalibration over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU bin).
    pu_bins : list[list[int, int]]
        List of PU bin edges.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Common input files to be transferred to every job.
    force_submit : bool, optional
        If True, forces job submission even if the proposed output files
        already exist. Otherwise, the program quits before submission.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # Loop over PU bins
    # -------------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename=os.path.join(log_dir, 'submit_runCalib.condor'),
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir, out_file=log_stem + '.out',
                                  err_dir=log_dir, err_file=log_stem + '.err',
                                  log_dir=log_dir, log_file=log_stem + '.log',
                                  cpus=1, memory='100MB', disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)

        # Hold all output filenames
        calib_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            calib_output_files.append(out_file)

            job_args = ['runCalibration.py', pairs_file, out_file,
                        "--no-genjet-plots", '--stage2', '--no-correction-fit',
                        '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]

            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])

            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename=os.path.join(log_dir, 'haddSmall.condor'),
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir,
                                  out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + calib_output_files

        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_args,
                        input_files=calib_output_files,
                        output_files=hadd_output)

        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir, '%s.status' % stem))
        for job in runCalib_jobs:
            calib_dag.add_job(job)

        calib_dag.add_job(hadder, requires=[j for j in runCalib_jobs])

        # Check if any of the output files already exist - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag.\nFILE: %s' % f)

        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)

    return status_files
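
# Hedged usage sketch (not part of the original module): illustrates a
# possible call to submit_runCalib_dag. Paths, bin edges and the `append`
# template are placeholders; `append` is formatted with only the keys
# puMin and puMax inside the function.
def _example_submit_runCalib():
    # Placeholder eta bin edges and PU bins
    eta_bins = [0.0, 0.348, 0.695, 1.044, 1.392, 1.74, 2.172, 3.0, 3.5, 4.0, 4.5, 5.0]
    pu_bins = [[0, 10], [15, 25], [30, 40]]
    status_files = submit_runCalib_dag(pairs_file='/hdfs/user/DATASET/pairs/pairs_MC_ak4.root',
                                       log_dir='/storage/user/jobs/logs',
                                       append='_PU{puMin}to{puMax}',
                                       pu_bins=pu_bins,
                                       eta_bins=eta_bins,
                                       common_input_files=['runCalibration.py'],
                                       force_submit=False)
    print 'Monitor with: DAGstatus.py', ' '.join(status_files)
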