def submit_resolution_dag(pairs_file, max_l1_pt, log_dir, append,
                          pu_bins, eta_bins, common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG per PU bin for one pairs file.

    For each PU bin, one makeResolutionPlots.py job is created per exclusive
    eta bin, plus one per enabled inclusive eta region, followed by a hadd
    job that merges their outputs into one file. Outputs are written to a
    'resolution' directory alongside the directory holding the pairs file.

    Unless ``force_submit`` is True, the proposed output files of *all* PU
    bins are checked before anything is submitted, so a clash in a later PU
    bin cannot leave the DAGs of earlier PU bins already submitted.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.

    max_l1_pt : int
        Maximum L1 pt. Currently only used to fill the ``maxL1Pt`` field of
        ``append``; it is not passed on to makeResolutionPlots.py.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        It is passed through ``str.format`` with the keys ``puMin``,
        ``puMax`` and ``maxL1Pt``.

    pu_bins : list[list[int, int]] or None
        List of PU bin edges. If empty or None, a single [-99, 999] bin is
        used. The edges are currently only used to fill ``puMin``/``puMax``
        in ``append``; they are not passed on to makeResolutionPlots.py.

    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.

    common_input_files
        Handed to the makeResolutionPlots JobSet as ``common_input_files``
        (presumably a list of files every job needs - confirm with caller).

    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist.
        Otherwise, the function quits before submitting anything.

    Returns
    -------
    int or None
        1 if submission was abandoned because an output file already exists,
        otherwise None.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    dataset_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(dataset_dir, 'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    # set ridiculous limits if no cut on PU
    pu_bins = list(pu_bins or [[-99, 999]])

    # The eta bin indices are the same for every PU bin, so get them once
    eta_inds = [ind for ind, _ in enumerate(pairwise(eta_bins))]

    # Inclusive bins (central, forward, all)
    # remove the [0:1] to do all - currently central only 'cos HF broke
    incl_regions = ['central', 'forward', 'all'][0:1]

    # Work out every output filename for every PU bin up front
    # ---------------------------------------------------------------------
    plans = []
    for (pu_min, pu_max) in pu_bins:
        fmt_dict = dict(puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt)
        tail = append.format(**fmt_dict) + '.root'
        excl_files = [os.path.join(out_dir, out_stem + "_%d" % ind + tail)
                      for ind in eta_inds]
        incl_files = [os.path.join(out_dir, out_stem + "_%s" % incl + tail)
                      for incl in incl_regions]
        final_file = os.path.join(out_dir, out_stem + tail)
        plans.append((final_file, excl_files, incl_files))

    # Check if any of the output files already exists - maybe we mucked up?
    # Done for all PU bins before submitting anything, so that we never end
    # up with only some of the DAGs submitted.
    # ---------------------------------------------------------------------
    if not force_submit:
        for final_file, excl_files, incl_files in plans:
            for f in [final_file] + excl_files + incl_files:
                if os.path.isfile(f):
                    # Single-argument print(...) gives the same output as
                    # the Python 2 print statement.
                    print('ERROR: output file already exists - not submitting')
                    print(' '.join(['FILE:', f]))
                    return 1

    # Build and submit one DAG per PU bin
    # ---------------------------------------------------------------------
    status_files = []
    for final_file, excl_files, incl_files in plans:

        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir, out_file=log_stem + '.out',
                             err_dir=log_dir, err_file=log_stem + '.err',
                             log_dir=log_dir, log_file=log_stem + '.log',
                             cpus=1, memory='100MB', disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # Add exclusive eta bins to this JobSet
        # NB the --maxPt, --PUmin and --PUmax options are deliberately not
        # passed to makeResolutionPlots.py at the moment.
        for ind, out_file in zip(eta_inds, excl_files):
            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--excl', '--etaInd', ind]
            res_jobs.add_job(ht.Job(name='res_%d' % ind,
                                    args=job_args,
                                    input_files=[pairs_file],
                                    output_files=[out_file]))

        # Add inclusive bins to this JobSet
        for incl, out_file in zip(incl_regions, incl_files):
            job_args = ['makeResolutionPlots.py', pairs_file, out_file,
                        '--incl']
            # 'all' is selected by passing no region flag
            if incl != 'all':
                job_args.append('--%s' % incl)
            res_jobs.add_job(ht.Job(name='res_%s' % incl,
                                    args=job_args,
                                    input_files=[pairs_file],
                                    output_files=[out_file]))

        # Add hadd jobs
        # -----------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # hadd takes the output file first, then all the files to merge
        res_output_files = excl_files + incl_files
        hadd_output = [final_file]
        hadder = ht.Job(name='haddRes',
                        args=hadd_output + res_output_files,
                        input_files=res_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # -----------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        all_res_jobs = [job for job in res_jobs]
        for job in all_res_jobs:
            res_dag.add_job(job)

        # The hadd job can only run once every resolution job has finished
        res_dag.add_job(hadder, requires=all_res_jobs)

        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print('For all statuses:')
    print(' '.join(['DAGstatus.py', ' '.join(status_files)]))
def submit_runCalib_dag(pairs_file, log_dir, append, pu_bins, eta_bins, common_input_files,
                        force_submit=False):
    """Submit one runCalibration DAG per PU bin for one pairs file.

    For each PU bin, one runCalibration.py job is created per exclusive eta
    bin, followed by a hadd job that merges their outputs into one file.
    Outputs are written to an 'output' directory alongside the directory
    holding the pairs file. The condor and DAG files go in ``log_dir``.

    Unless ``force_submit`` is True, the proposed output files of *all* PU
    bins are checked before anything is submitted, so a clash in a later PU
    bin cannot leave the DAGs of earlier PU bins already submitted.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        It is passed through ``str.format`` with the keys ``puMin`` and
        ``puMax``.

    pu_bins : list[list[int, int]] or None
        List of PU bin edges. If empty or None, a single [-99, 999] bin is
        used.

    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.

    common_input_files
        Handed to the runCalibration JobSet as ``common_input_files``
        (presumably a list of files every job needs - confirm with caller).

    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist.
        Otherwise, a RuntimeError is raised before submitting anything.

    Returns
    -------
    list
        The status file of each submitted DAG, one per PU bin.

    Raises
    ------
    RuntimeError
        If ``force_submit`` is False and a proposed output file already
        exists.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    dataset_dir = os.path.dirname(os.path.dirname(pairs_file))
    out_dir = os.path.join(dataset_dir, 'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # set ridiculous limits if no cut on PU
    pu_bins = list(pu_bins or [[-99, 999]])

    # The eta bin indices are the same for every PU bin, so get them once
    eta_inds = [ind for ind, _ in enumerate(pairwise(eta_bins))]

    # Work out every output filename for every PU bin up front
    # ---------------------------------------------------------------------
    plans = []
    for (pu_min, pu_max) in pu_bins:
        fmt_dict = dict(puMin=pu_min, puMax=pu_max)
        tail = append.format(**fmt_dict) + '.root'
        calib_output_files = [os.path.join(out_dir, out_stem + "_%d" % ind + tail)
                              for ind in eta_inds]
        final_file = os.path.join(out_dir, out_stem + tail)
        plans.append((pu_min, pu_max, final_file, calib_output_files))

    # Check if any of the output files already exists - maybe we mucked up?
    # Done for all PU bins before submitting anything, so that we never end
    # up with only some of the DAGs submitted.
    # ---------------------------------------------------------------------
    if not force_submit:
        for _, _, final_file, calib_output_files in plans:
            for f in [final_file] + calib_output_files:
                if os.path.isfile(f):
                    raise RuntimeError('Output file already exists - not submitting.'
                                       '\nTo bypass, use -f flag. \nFILE: %s' % f)

    # Build and submit one DAG per PU bin
    # ---------------------------------------------------------------------
    status_files = []
    for pu_min, pu_max, final_file, calib_output_files in plans:
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename=os.path.join(log_dir, 'submit_runCalib.condor'),
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir, out_file=log_stem + '.out',
                                  err_dir=log_dir, err_file=log_stem + '.err',
                                  log_dir=log_dir, log_file=log_stem + '.log',
                                  cpus=1, memory='100MB', disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # Add exclusive eta bins to this JobSet
        for ind, out_file in zip(eta_inds, calib_output_files):
            job_args = ['runCalibration.py', pairs_file, out_file,
                        "--no-genjet-plots", '--stage2',
                        '--no-correction-fit',
                        '--PUmin', pu_min, '--PUmax', pu_max,
                        '--etaInd', ind]
            runCalib_jobs.add_job(ht.Job(name='calib_%d' % ind,
                                         args=job_args,
                                         input_files=[pairs_file],
                                         output_files=[out_file]))

        # Add hadd jobs
        # -----------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename=os.path.join(log_dir, 'haddSmall.condor'),
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir, out_file=log_stem + '.out',
                              err_dir=log_dir, err_file=log_stem + '.err',
                              log_dir=log_dir, log_file=log_stem + '.log',
                              cpus=1, memory='100MB', disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # hadd takes the output file first, then all the files to merge
        hadd_output = [final_file]
        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_output + calib_output_files,
                        input_files=calib_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # -----------------------------------------------------------------
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir, '%s.status' % stem))
        all_calib_jobs = [job for job in runCalib_jobs]
        for job in all_calib_jobs:
            calib_dag.add_job(job)

        # The hadd job can only run once every calibration job has finished
        calib_dag.add_job(hadder, requires=all_calib_jobs)

        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    # Single-argument print(...) gives the same output as the Python 2
    # print statement.
    print('For all statuses:')
    print(' '.join(['DAGstatus.py', ' '.join(status_files)]))
    return status_files
def submit_resolution_dag(pairs_file,
                          max_l1_pt,
                          log_dir,
                          append,
                          pu_bins,
                          eta_bins,
                          common_input_files,
                          force_submit=False):
    """Submit one makeResolutionPlots DAG per PU bin for one pairs file.

    Each DAG runs makeResolutionPlots.py once per exclusive eta bin and once
    per enabled inclusive eta region, and then finally hadds the results
    together. Results are stored in a 'resolution' directory next to the
    directory that contains the pairs file.

    When ``force_submit`` is False, the output files proposed for every PU
    bin are checked before the first DAG is submitted. This guarantees that
    either all DAGs are submitted or none are.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.

    max_l1_pt : int
        Maximum L1 pt. At present this only fills the ``maxL1Pt`` field of
        ``append``; it is not forwarded to makeResolutionPlots.py.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        Formatted via ``str.format`` with keys ``puMin``, ``puMax`` and
        ``maxL1Pt``.

    pu_bins : list[list[int, int]] or None
        List of PU bin edges. A single [-99, 999] bin is used if this is
        empty or None. At present the edges only fill ``puMin``/``puMax`` in
        ``append``; they are not forwarded to makeResolutionPlots.py.

    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.

    common_input_files
        Given to the makeResolutionPlots JobSet as ``common_input_files``
        (presumably a list of files every job needs - confirm with caller).

    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist.
        Otherwise, the function quits before submitting anything.

    Returns
    -------
    int or None
        1 if nothing was submitted because an output file already exists,
        otherwise None.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for res* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/resolution/
    out_dir = os.path.join(os.path.dirname(os.path.dirname(pairs_file)),
                           'resolution')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "res_")

    def out_path(tag, suffix):
        """Return the full path of the output file with this tag."""
        return os.path.join(out_dir, out_stem + tag + suffix)

    # set ridiculous limits if no cut on PU
    pu_bins = list(pu_bins or [[-99, 999]])

    # Eta bin indices do not depend on the PU bin, so only compute them once
    eta_inds = [ind for ind, _ in enumerate(pairwise(eta_bins))]

    # Inclusive bins (central, forward, all)
    # remove the [0:1] to do all - currently central only 'cos HF broke
    incl_regions = ['central', 'forward', 'all'][0:1]

    # First pass: decide on all output filenames, for all PU bins
    # ---------------------------------------------------------------------
    pu_bin_outputs = []
    for (pu_min, pu_max) in pu_bins:
        suffix = append.format(
            puMin=pu_min, puMax=pu_max, maxL1Pt=max_l1_pt) + '.root'
        pu_bin_outputs.append({
            'final': out_path('', suffix),
            'excl': [out_path("_%d" % ind, suffix) for ind in eta_inds],
            'incl': [out_path("_%s" % incl, suffix) for incl in incl_regions],
        })

    # Check if any of the output files already exists - maybe we mucked up?
    # This must cover every PU bin before the first submission, otherwise a
    # clash in a later bin would leave earlier DAGs already submitted.
    # ---------------------------------------------------------------------
    if not force_submit:
        for outputs in pu_bin_outputs:
            candidates = [outputs['final']] + outputs['excl'] + outputs['incl']
            for f in candidates:
                if os.path.isfile(f):
                    # Single-argument print(...) gives the same output as
                    # the Python 2 print statement.
                    print('ERROR: output file already exists - not submitting')
                    print(' '.join(['FILE:', f]))
                    return 1

    # Second pass: build and submit one DAG per PU bin
    # ---------------------------------------------------------------------
    status_files = []
    for outputs in pu_bin_outputs:

        log_stem = 'res.$(cluster).$(process)'
        res_jobs = ht.JobSet(exe='python',
                             copy_exe=False,
                             filename='submit_resolution.condor',
                             setup_script='worker_setup.sh',
                             share_exe_setup=True,
                             out_dir=log_dir,
                             out_file=log_stem + '.out',
                             err_dir=log_dir,
                             err_file=log_stem + '.err',
                             log_dir=log_dir,
                             log_file=log_stem + '.log',
                             cpus=1,
                             memory='100MB',
                             disk='100MB',
                             transfer_hdfs_input=False,
                             common_input_files=common_input_files,
                             hdfs_store=out_dir)

        # Add exclusive eta bins to this JobSet
        # NB the --maxPt, --PUmin and --PUmax options are deliberately not
        # passed to makeResolutionPlots.py at the moment.
        for ind, out_file in zip(eta_inds, outputs['excl']):
            job_args = [
                'makeResolutionPlots.py', pairs_file, out_file, '--excl',
                '--etaInd', ind
            ]
            res_job = ht.Job(name='res_%d' % ind,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add inclusive bins to this JobSet
        for incl, out_file in zip(incl_regions, outputs['incl']):
            job_args = [
                'makeResolutionPlots.py', pairs_file, out_file, '--incl'
            ]
            # 'all' is selected by passing no region flag
            if incl != 'all':
                job_args.append('--%s' % incl)
            res_job = ht.Job(name='res_%s' % incl,
                             args=job_args,
                             input_files=[pairs_file],
                             output_files=[out_file])
            res_jobs.add_job(res_job)

        # Add hadd jobs
        # -----------------------------------------------------------------
        log_stem = 'resHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              share_exe_setup=True,
                              out_dir=log_dir,
                              out_file=log_stem + '.out',
                              err_dir=log_dir,
                              err_file=log_stem + '.err',
                              log_dir=log_dir,
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # hadd takes the output file first, then all the files to merge
        res_output_files = outputs['excl'] + outputs['incl']
        hadd_output = [outputs['final']]
        hadder = ht.Job(name='haddRes',
                        args=hadd_output + res_output_files,
                        input_files=res_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # -----------------------------------------------------------------
        stem = 'res_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        res_dag = ht.DAGMan(filename='%s.dag' % stem,
                            status_file='%s.status' % stem)
        all_res_jobs = [job for job in res_jobs]
        for job in all_res_jobs:
            res_dag.add_job(job)

        # The hadd job can only run once every resolution job has finished
        res_dag.add_job(hadder, requires=all_res_jobs)

        # res_dag.write()
        res_dag.submit()
        status_files.append(res_dag.status_file)

    print('For all statuses:')
    print(' '.join(['DAGstatus.py', ' '.join(status_files)]))
# Example #4
# 0
def submit_runCalib_dag(pairs_file,
                        log_dir,
                        append,
                        pu_bins,
                        eta_bins,
                        common_input_files,
                        force_submit=False):
    """Submit one runCalibration DAG per PU bin for one pairs file.

    Each DAG runs runCalibration.py once per exclusive eta bin, and then
    finally hadds the results together. Results are stored in an 'output'
    directory next to the directory that contains the pairs file. The DAG
    and its status file are written to ``log_dir``.

    When ``force_submit`` is False, the output files proposed for every PU
    bin are checked before the first DAG is submitted. This guarantees that
    either all DAGs are submitted or none are.

    Parameters
    ----------
    pairs_file : str
        Pairs file to process. Must be full path.

    log_dir : str
        Directory for STDOUT/STDERR/LOG files. Should be on /storage.

    append : str
        String to append to filenames to track various settings (e.g. PU bin).
        Formatted via ``str.format`` with keys ``puMin`` and ``puMax``.

    pu_bins : list[list[int, int]] or None
        List of PU bin edges. A single [-99, 999] bin is used if this is
        empty or None.

    eta_bins : list[float]
        List of eta bin edges, including upper edge of last bin.

    common_input_files
        Given to the runCalibration JobSet as ``common_input_files``
        (presumably a list of files every job needs - confirm with caller).

    force_submit : bool, optional
        If True, forces job submission even if proposed output files
        already exist.
        Otherwise, the function quits before submitting anything.

    Returns
    -------
    int or list
        1 if nothing was submitted because an output file already exists,
        otherwise the status file of each submitted DAG, one per PU bin.

    """
    cc.check_file_exists(pairs_file)

    # Setup output directory for output* files
    # e.g. if pairs file in DATASET/pairs/pairs.root
    # then output goes in DATASET/output/
    out_dir = os.path.join(os.path.dirname(os.path.dirname(pairs_file)),
                           'output')
    cc.check_create_dir(out_dir, info=True)

    # Stem for output filename
    out_stem = os.path.splitext(os.path.basename(pairs_file))[0]
    out_stem = out_stem.replace("pairs_", "output_")

    # set ridiculous limits if no cut on PU
    pu_bins = list(pu_bins or [[-99, 999]])

    # Eta bin indices do not depend on the PU bin, so only compute them once
    eta_inds = [ind for ind, _ in enumerate(pairwise(eta_bins))]

    # First pass: decide on all output filenames, for all PU bins
    # ---------------------------------------------------------------------
    pu_bin_outputs = []
    for (pu_min, pu_max) in pu_bins:
        suffix = append.format(puMin=pu_min, puMax=pu_max) + '.root'
        pu_bin_outputs.append({
            'pu_min': pu_min,
            'pu_max': pu_max,
            'final': os.path.join(out_dir, out_stem + suffix),
            'calib': [
                os.path.join(out_dir, out_stem + "_%d" % ind + suffix)
                for ind in eta_inds
            ],
        })

    # Check if any of the output files already exists - maybe we mucked up?
    # This must cover every PU bin before the first submission, otherwise a
    # clash in a later bin would leave earlier DAGs already submitted.
    # ---------------------------------------------------------------------
    if not force_submit:
        for outputs in pu_bin_outputs:
            for f in [outputs['final']] + outputs['calib']:
                if os.path.isfile(f):
                    # Single-argument print(...) gives the same output as
                    # the Python 2 print statement.
                    print('ERROR: output file already exists - not submitting')
                    print(' '.join(['FILE:', f]))
                    return 1

    # Second pass: build and submit one DAG per PU bin
    # ---------------------------------------------------------------------
    status_files = []
    for outputs in pu_bin_outputs:
        pu_min, pu_max = outputs['pu_min'], outputs['pu_max']
        calib_output_files = outputs['calib']
        log.info('**** Doing PU bin %g - %g', pu_min, pu_max)

        log_stem = 'runCalib.$(cluster).$(process)'
        runCalib_jobs = ht.JobSet(exe='python',
                                  copy_exe=False,
                                  filename='submit_runCalib.condor',
                                  setup_script='worker_setup.sh',
                                  share_exe_setup=True,
                                  out_dir=log_dir,
                                  out_file=log_stem + '.out',
                                  err_dir=log_dir,
                                  err_file=log_stem + '.err',
                                  log_dir=log_dir,
                                  log_file=log_stem + '.log',
                                  cpus=1,
                                  memory='100MB',
                                  disk='100MB',
                                  transfer_hdfs_input=False,
                                  common_input_files=common_input_files,
                                  hdfs_store=out_dir)

        # Add exclusive eta bins to this JobSet
        for ind, out_file in zip(eta_inds, calib_output_files):
            job_args = [
                'runCalibration.py', pairs_file, out_file, "--no-genjet-plots",
                '--stage2', '--no-correction-fit', '--PUmin', pu_min,
                '--PUmax', pu_max, '--etaInd', ind
            ]
            calib_job = ht.Job(name='calib_%d' % ind,
                               args=job_args,
                               input_files=[pairs_file],
                               output_files=[out_file])
            runCalib_jobs.add_job(calib_job)

        # Add hadd jobs
        # -----------------------------------------------------------------
        log_stem = 'runCalibHadd.$(cluster).$(process)'

        hadd_jobs = ht.JobSet(exe='hadd',
                              copy_exe=False,
                              share_exe_setup=True,
                              filename='haddSmall.condor',
                              setup_script="cmssw_setup.sh",
                              out_dir=log_dir,
                              out_file=log_stem + '.out',
                              err_dir=log_dir,
                              err_file=log_stem + '.err',
                              log_dir=log_dir,
                              log_file=log_stem + '.log',
                              cpus=1,
                              memory='100MB',
                              disk='20MB',
                              transfer_hdfs_input=False,
                              hdfs_store=out_dir)

        # hadd takes the output file first, then all the files to merge
        hadd_output = [outputs['final']]
        hadder = ht.Job(name='haddRunCalib',
                        args=hadd_output + calib_output_files,
                        input_files=calib_output_files,
                        output_files=hadd_output)
        hadd_jobs.add_job(hadder)

        # Add all jobs to DAG, with necessary dependencies
        # -----------------------------------------------------------------
        stem = 'runCalib_%s_%s' % (strftime("%H%M%S"), cc.rand_str(3))
        calib_dag = ht.DAGMan(filename=os.path.join(log_dir, '%s.dag' % stem),
                              status_file=os.path.join(log_dir,
                                                       '%s.status' % stem))
        all_calib_jobs = [job for job in runCalib_jobs]
        for job in all_calib_jobs:
            calib_dag.add_job(job)

        # The hadd job can only run once every calibration job has finished
        calib_dag.add_job(hadder, requires=all_calib_jobs)

        # calib_dag.write()
        calib_dag.submit()
        status_files.append(calib_dag.status_file)

    print('For all statuses:')
    print(' '.join(['DAGstatus.py', ' '.join(status_files)]))
    return status_files