def submit_resolution_dag(pairs_file, max_l1_pt, log_dir, append,
                          pu_bins, eta_bins, common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG for one pairs file.

    This will run makeResolutionPlots over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    max_l1_pt : int
        Maximum L1 pt to consider when making plots.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        May contain {puMin}, {puMax}, {maxL1Pt} format placeholders.
    pu_bins : list[list[int, int]]
        List of PU bin edges. If None/empty, one catch-all bin is used.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Input files common to all jobs in the JobSet.
    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist. Otherwise, raises RuntimeError before submission.

    Returns
    -------
    list[str]
        DAG status filenames, one per PU bin.

    Raises
    ------
    RuntimeError
        If an output file already exists and force_submit is False.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)

        # Hold all output filenames
        res_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)
            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--excl',
                        # '--maxPt', max_l1_pt,
                        # '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]
            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add inclusive bins (central, forward, all)
        # remove the [0:1] to do all - currently central only 'cos HF broke
        for incl in ['central', 'forward', 'all'][0:1]:
            out_file = out_stem + "_%s" % incl + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)
            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--incl']
            # , '--maxPt', max_l1_pt,
            # '--PUmin', pu_min, '--PUmax', pu_max]
            if incl != 'all':
                job_args.append('--%s' % incl)
            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'
        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + res_output_files
        hadder = ht.Job(name='haddRes',
                        args=hadd_args,
                        input_files=res_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        for job in res_jobs:
            res_dag.add_job(job)
        # hadd job only runs once every resolution job has finished
        res_dag.add_job(hadder, requires=[j for j in res_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + res_output_files:
                if os.path.isfile(f):
                    # Raise (consistent with submit_runCalib_dag) rather than
                    # silently returning an error code.
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag. \nFILE: %s' % f)
        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print('For all statuses:')
    print('DAGstatus.py ' + ' '.join(status_files))
    # Return the status files so callers can monitor all DAGs,
    # matching submit_runCalib_dag's contract.
    return status_files
def submit_runCalib_dag(pairs_file, log_dir, append, pu_bins, eta_bins,
                        common_input_files, force_submit=False):
    """Submit one runCalibration DAG for one pairs file.

    This will run runCalibration over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        May contain {puMin} and {puMax} format placeholders.
    pu_bins : list[list[int, int]]
        List of PU bin edges. If None/empty, one catch-all bin is used.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Input files common to all jobs in the JobSet.
    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist. Otherwise, raises RuntimeError before submission.

    Returns
    -------
    list[str]
        DAG status filenames, one per PU bin.

    Raises
    ------
    RuntimeError
        If an output file already exists and force_submit is False.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        # One JobSet per PU bin: all runCalibration jobs share an exe + setup
        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename=os.path.join(log_dir, 'submit_runCalib.condor'),
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir, out_file=log_stem + '.out',
                                  err_dir=log_dir, err_file=log_stem + '.err',
                                  log_dir=log_dir, log_file=log_stem + '.log',
                                  cpus=1, memory='100MB', disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)

        # Hold all output filenames
        calib_output_files = []

        # Add exclusive eta bins to this JobSet, one job per bin index
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            calib_output_files.append(out_file)
            job_args = ['runCalibration.py', pairs_file, out_file,
                        "--no-genjet-plots", '--stage2',
                        '--no-correction-fit',
                        '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]
            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])
            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'
        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename=os.path.join(log_dir, 'haddSmall.condor'),
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        # hadd's CLI takes the destination first, then the inputs
        hadd_args = hadd_output + calib_output_files
        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_args,
                        input_files=calib_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        # Timestamp + random string avoids filename clashes between submissions
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir, '%s.status' % stem))
        for job in runCalib_jobs:
            calib_dag.add_job(job)
        # hadd job only runs once every calibration job has finished
        calib_dag.add_job(hadder, requires=[j for j in runCalib_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag. \nFILE: %s' % f)
        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print 'For all statuses:'
    print 'DAGstatus.py', ' '.join(status_files)
    return status_files
def submit_resolution_dag(pairs_file, max_l1_pt, log_dir, append,
                          pu_bins, eta_bins, common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG for one pairs file.

    This will run makeResolutionPlots over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    max_l1_pt : int
        Maximum L1 pt to consider when making plots.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        May contain {puMin}, {puMax}, {maxL1Pt} format placeholders.
    pu_bins : list[list[int, int]]
        List of PU bin edges. If None/empty, one catch-all bin is used.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Input files common to all jobs in the JobSet.
    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist. Otherwise, raises RuntimeError before submission.

    Returns
    -------
    list[str]
        DAG status filenames, one per PU bin.

    Raises
    ------
    RuntimeError
        If an output file already exists and force_submit is False.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)

        # Hold all output filenames
        res_output_files = []

        # Add exclusive eta bins to this JobSet
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)
            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--excl',
                        # '--maxPt', max_l1_pt,
                        # '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]
            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add inclusive bins (central, forward, all)
        # remove the [0:1] to do all - currently central only 'cos HF broke
        for incl in ['central', 'forward', 'all'][0:1]:
            out_file = out_stem + "_%s" % incl + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            res_output_files.append(out_file)
            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--incl']
            # , '--maxPt', max_l1_pt,
            # '--PUmin', pu_min, '--PUmax', pu_max]
            if incl != 'all':
                job_args.append('--%s' % incl)
            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'
        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        hadd_args = hadd_output + res_output_files
        hadder = ht.Job(name='haddRes',
                        args=hadd_args,
                        input_files=res_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        for job in res_jobs:
            res_dag.add_job(job)
        # hadd job only runs once every resolution job has finished
        res_dag.add_job(hadder, requires=[j for j in res_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + res_output_files:
                if os.path.isfile(f):
                    # Raise (consistent with submit_runCalib_dag) rather than
                    # silently returning an error code.
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag. \nFILE: %s' % f)
        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print('For all statuses:')
    print('DAGstatus.py ' + ' '.join(status_files))
    # Return the status files so callers can monitor all DAGs,
    # matching submit_runCalib_dag's contract.
    return status_files
def submit_runCalib_dag(pairs_file, log_dir, append, pu_bins, eta_bins,
                        common_input_files, force_submit=False):
    """Submit one runCalibration DAG for one pairs file.

    This will run runCalibration over exclusive and inclusive eta bins,
    and then finally hadd the results together.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.
    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.
    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        May contain {puMin} and {puMax} format placeholders.
    pu_bins : list[list[int, int]]
        List of PU bin edges. If None/empty, one catch-all bin is used.
    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.
    common_input_files : list[str]
        Input files common to all jobs in the JobSet.
    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist. Otherwise, raises RuntimeError before submission.

    Returns
    -------
    list[str]
        DAG status filenames, one per PU bin.

    Raises
    ------
    RuntimeError
        If an output file already exists and force_submit is False.
    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(out_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # Loop over PU bins
    # ---------------------------------------------------------------------
    pu_bins = pu_bins or [[-99, 999]]  # set ridiculous limits if no cut on PU
    status_files = []
    for (pu_min, pu_max) in pu_bins:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        # Put the condor submit file in log_dir, consistent with the
        # DAG/status files below.
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename=os.path.join(log_dir, 'submit_runCalib.condor'),
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir, out_file=log_stem + '.out',
                                  err_dir=log_dir, err_file=log_stem + '.err',
                                  log_dir=log_dir, log_file=log_stem + '.log',
                                  cpus=1, memory='100MB', disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # For creating filenames later
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)

        # Hold all output filenames
        calib_output_files = []

        # Add exclusive eta bins to this JobSet, one job per bin index
        for ind, (eta_min, eta_max) in enumerate(pairwise(eta_bins)):
            out_file = out_stem + "_%d" % ind + append.format(**fmt_dict) + '.root'
            out_file = os.path.join(out_dir, out_file)
            calib_output_files.append(out_file)
            job_args = ['runCalibration.py', pairs_file, out_file,
                        "--no-genjet-plots", '--stage2',
                        '--no-correction-fit',
                        '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]
            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])
            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # ---------------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'
        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename=os.path.join(log_dir, 'haddSmall.condor'),
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # Construct final hadded file name
        final_file = os.path.join(out_dir, out_stem + append.format(**fmt_dict) + '.root')
        hadd_output = [final_file]
        # hadd's CLI takes the destination first, then the inputs
        hadd_args = hadd_output + calib_output_files
        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_args,
                        input_files=calib_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # ---------------------------------------------------------------------
        # Timestamp + random string avoids filename clashes between submissions
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir, '%s.status' % stem))
        for job in runCalib_jobs:
            calib_dag.add_job(job)
        # hadd job only runs once every calibration job has finished
        calib_dag.add_job(hadder, requires=[j for j in runCalib_jobs])

        # Check if any of the output files already exists - maybe we mucked up?
        # ---------------------------------------------------------------------
        if not force_submit:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    # Raise rather than returning an error code, consistent
                    # with the other submit_runCalib_dag variant in this file.
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag. \nFILE: %s' % f)
        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print('For all statuses:')
    print('DAGstatus.py ' + ' '.join(status_files))
    return status_files