Example #1
    )  # closes an earlier add_argument() call not shown in this excerpt
    parser.add_argument(
        '-v',
        '--verbose',
        dest='verbose',
        action='store_true',
        default=False,
        required=False,
        help='R|Verbose output',
    )
    args = parser.parse_args()
    era = args.era
    analysis_type = args.type

    if args.verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    cmssw_base = os.environ['CMSSW_BASE']
    model = NonResonantModel()
    coef_file = os.path.join(
        cmssw_base,
        "src/HHStatAnalysis/AnalyticalModels/data/coefficientsByBin_extended_3M_costHHSim_19-4.txt"
    )
    assert (os.path.isfile(coef_file))
    coefs = model.ReadCoefficients(coef_file)
    hist_file = os.path.join(
        cmssw_base,
        "src/Support/NonResonant/Distros_5p_SM3M_sumBenchJHEP_13TeV_19-4.root")
    assert (os.path.isfile(hist_file))
    hist_title = "H1bin4"
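The excerpt above begins after the argument parser has already been partly configured; a minimal sketch of the setup it appears to rely on is given below. The imports, the option flags for the era and analysis type, and the NonResonantModel import path are assumptions inferred from the `args.era` / `args.type` reads and the coefficient-file location used in the excerpt.

import argparse
import logging
import os

# assumed import path, inferred from the data-file location used above
from HHStatAnalysis.AnalyticalModels.NonResonantModel import NonResonantModel

parser = argparse.ArgumentParser()
parser.add_argument(
    '-e',
    '--era',
    dest='era',
    required=True,
    help='R|Era of the data-taking period',  # assumed flag and wording
)
parser.add_argument(
    '-t',
    '--type',
    dest='type',
    required=True,
    help='R|Type of the analysis',  # assumed flag and wording
)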
Example #2
parser.add_argument('-e', '--extension',  # NB: flag names assumed; the original excerpt begins mid-call
  type = str, nargs = '+', dest = 'extension', metavar = 'ext', required = False,
  choices = [ 'png', 'pdf' ], default = [ 'png', 'pdf' ],
  help = 'R|Extension of the output files',
)
parser.add_argument('-f', '--force',
  dest = 'force', action = 'store_true', default = False,
  help = 'R|Create the output directory if it does not exist',
)
parser.add_argument('-v', '--verbose',
  dest = 'verbose', action = 'store_true', default = False,
  help = 'R|Enable verbose output',
)

args = parser.parse_args()

logging.getLogger().setLevel(logging.DEBUG if args.verbose else logging.INFO)

pattern = args.input
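# the input pattern must contain a literal '{sample_name}' placeholder,
# e.g. '/some/input/dir/{sample_name}/rle.txt' (hypothetical path)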
if '{sample_name}' not in pattern:
  raise ValueError('No {sample_name} found in pattern %s' % pattern)

input_dir = os.path.dirname(pattern)
if not hdfs.isdir(input_dir):
  raise ValueError('No such input directory: %s' % input_dir)

if args.era == '2017':
  from tthAnalysis.HiggsToTauTau.samples.tthAnalyzeSamples_2017 import samples_2017 as samples
  from tthAnalysis.HiggsToTauTau.analysisSettings import lumi_2017 as lumi
  samples_to_sum = samples_to_sum_2017

  samples_lut = {}
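The excerpt breaks off just as the sample lookup table is being initialised. A plausible continuation is sketched below; it assumes the sample dictionary has the same layout as in the later examples (each value carries a 'process_name_specific' field) and guesses that the table maps process names back to dictionary keys.

  for sample_key, sample_info in samples.items():
    samples_lut[sample_info['process_name_specific']] = sample_key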
Example #3
    def unmute(self):
        logging.getLogger().setLevel(logging.DEBUG)
Example #4
    def mute(self):
        logging.getLogger().setLevel(logging.INFO)
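Examples #3 and #4 are the two halves of a simple mute/unmute toggle acting on the root logger. A minimal self-contained sketch of such a pair (the class name is hypothetical):

import logging

class VerbosityToggle:

    def unmute(self):
        # enable DEBUG output on the root logger
        logging.getLogger().setLevel(logging.DEBUG)

    def mute(self):
        # drop back to INFO, suppressing DEBUG messages
        logging.getLogger().setLevel(logging.INFO)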
def validate(output_dir, verbose=False):
    '''Validates the job execution carried out by dump_rle_parallel()
    Args:
      output_dir: string, The directory where all RLE files are stored
      verbose:    bool,   Enable verbose output

    Returns:
      None

    The validation is quite basic: the program loops over the subdirectories of output_dir,
    matches them against the entries of the sample dictionary and counts the number of lines
    in each RLE file. If the number of lines doesn't match the number of entries in the
    corresponding ROOT file, the user is notified about the discrepancy.

    In principle, the script could also print the commands needed to fix the issues (and dump
    them to an easily executable file), but let's leave that for another time.
    '''

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    root_file_regex = re.compile(r'^tree_(\d+)\.root$')
    file_dict = {k: [] for k in ['excess', 'missing', 'corrupted']}

    try:

        for s_key, s_value in samples.iteritems():
            sample_name = s_value['process_name_specific']
            sample_dir = os.path.join(output_dir, sample_name)
            if os.path.isdir(sample_dir):
                logging.debug("Found sample directory {sample_dir}".format(
                    sample_dir=sample_dir))

                #NB! assume that there are no secondary paths in the dictionary (hence index 0!)
                sample_path_dict = s_value['local_paths'][0]
                sample_path = sample_path_dict['path']
                blacklist = sample_path_dict['blacklist']
                for sample_subdir in os.listdir(sample_path):
                    sample_subpath_idx = -1
                    try:
                        sample_subpath_idx = int(sample_subdir)
                    except ValueError:
                        continue
                    if sample_subpath_idx < 0:
                        raise ValueError("Internal error")
                    sample_subpath = os.path.join(sample_path, sample_subdir)
                    logging.debug(
                        "Processing sample subdirectory {sample_subpath}".
                        format(sample_subpath=sample_subpath))

                    for sample_file in os.listdir(sample_subpath):
                        sample_file_fullpath = os.path.join(
                            sample_subpath, sample_file)
                        if not sample_file.endswith(
                                '.root') or not os.path.isfile(
                                    sample_file_fullpath):
                            continue

                        root_file_regex_match = root_file_regex.search(
                            sample_file)
                        if not root_file_regex_match:
                            continue

                        root_file_idx = int(root_file_regex_match.group(1))
                        expected_rle_file_basename = '{root_file_idx}.txt'.format(
                            root_file_idx=root_file_idx)
                        expected_rle_file = os.path.join(
                            sample_dir, expected_rle_file_basename)
                        file_dict_entry = (expected_rle_file,
                                           sample_file_fullpath)
                        if root_file_idx in blacklist:
                            if os.path.isfile(expected_rle_file):
                                logging.warning(
                                    'Found RLE file {rle_file} (corresponding to blacklisted {root_file}) '
                                    'which you ought to delete'.format(
                                        rle_file=expected_rle_file,
                                        root_file=sample_file_fullpath,
                                    ))
                            file_dict['excess'].append(file_dict_entry)
                            continue

                        if not os.path.isfile(expected_rle_file):
                            logging.warning(
                                'Missing RLE file {rle_file} (corresponding to {root_file})'
                                .format(
                                    rle_file=expected_rle_file,
                                    root_file=sample_file_fullpath,
                                ))
                            file_dict['missing'].append(file_dict_entry)
                            continue
                        nof_rle_events = raw_linecount(expected_rle_file)
                        if nof_rle_events == 1 and os.path.getsize(
                                expected_rle_file) == 1:
                            # the RLE file contains only a newline, hence no events
                            nof_rle_events = 0

                        root_file = ROOT.TFile(sample_file_fullpath, 'read')
                        root_tree = root_file.Get('tree')
                        nof_entries = root_tree.GetEntries()

                        nof_events_diff = nof_rle_events - nof_entries
                        if nof_events_diff < 0:
                            logging.error(
                                'Missing {nof_events} events in {rle_filename} (corresponding to {sample_file}): '
                                'expected {expected}, got {actual}'.format(
                                    nof_events=abs(nof_events_diff),
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        elif nof_events_diff > 0:
                            logging.error(
                                'Got {nof_events} more events than expected in {rle_filename} (corresponding '
                                'to {sample_file}): expected {expected}, got {actual}'
                                .format(
                                    nof_events=nof_events_diff,
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        else:
                            logging.debug(
                                'File {rle_filename} (corresponding to {sample_file}) looks OK'
                                .format(
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                ))

    except KeyboardInterrupt:
        pass

    if any(map(bool, file_dict.values())):
        logging.info('Validation finished with errors')
        for key in file_dict.keys():
            if file_dict[key]:
                logging.info('Number of {key} RLE files: {nof_key}'.format(
                    key=key, nof_key=len(file_dict[key])))
                for entry in file_dict[key]:
                    logging.info('{rle_file} <=> {sample_file}'.format(
                        rle_file=entry[0], sample_file=entry[1]))
    else:
        logging.info('Validation finished successfully')
    return
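# A minimal usage sketch (the output path is hypothetical, and the `samples`
# dictionary for the era in question must already be imported at module level):
#
#   validate(output_dir='/home/user/dump_rle_output', verbose=True)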
def dump_rle_parallel(output_dir,
                      rle_branchNames,
                      treeName,
                      nof_files=100,
                      force=False,
                      test=False,
                      verbose=False,
                      sample='',
                      tmp_dir=''):
    '''Dumps RLE numbers "in parallel"
    Args:
      output_dir:      string, Path to the directory where the RLE files will be stored
      rle_branchNames: dict { string : string }, Specifies the run, lumi and event branch names
      treeName:        string,                   Name of the TTree
      nof_files:       int,                      Number of files to be processed by one sbatch job
      force:           bool,                     If True, creates `output_dir` if it's not there
      test:            bool,                     If True, create the job scripts but do not submit them to SLURM
      verbose:         bool,                     If True, prints lots of information to standard output
      sample:          string,                   (optional) sample name; if the sample name is not specified,
                                                 all samples will be processed
      tmp_dir:         string,                   (optional) directory for the generated job scripts and logs;
                                                 defaults to `output_dir`/tmp

    Returns:
      int array, List of sbatch job IDs that were submitted to SLURM
                 This list can be used to check whether the jobs submitted by this routine have finished or not

    The method does the following things:
      1) loops over the sample entries in the 2016 dictionary (default) or selects only one sample (specified by `sample`)
      2) loops over all ROOT files under each sample directory and arranges them into chunks of `nof_files` files
      3) creates a Python script and a Bash script for each chunk, which loop over the entries in that chunk
      4) submits each job to SLURM, unless `test` is True
      5) returns the list of sbatch job IDs that were assigned to the jobs
    '''
    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    if not os.path.isdir(output_dir):
        if not force:
            logging.error("Directory '{output_dir}' does not exist".format(
                output_dir=output_dir, ))
            sys.exit(1)
        else:
            logging.debug(
                "Creating directory '{output_dir}' since it's missing".format(
                    output_dir=output_dir, ))
            os.makedirs(output_dir)

    # let's make the temporary directories
    output_dir_tmp = os.path.join(output_dir,
                                  "tmp") if not tmp_dir else tmp_dir
    if not create_dir_if_not_exist(output_dir_tmp): sys.exit(1)
    output_dir_tmp_sh = os.path.join(output_dir_tmp, "sh")
    output_dir_tmp_py = os.path.join(output_dir_tmp, "py")
    output_dir_tmp_log = os.path.join(output_dir_tmp, "log")
    if not create_dir_if_not_exist(output_dir_tmp_sh): sys.exit(1)
    if not create_dir_if_not_exist(output_dir_tmp_py): sys.exit(1)
    if not create_dir_if_not_exist(output_dir_tmp_log): sys.exit(1)
    scratch_dir = "/scratch/{user_name}/dump_rle".format(
        user_name=getpass.getuser())

    # extract the numeric index N from a file name of the form 'tree_N.root'
    idx = lambda x: int(x[x.rfind('_') + 1:x.rfind('.')])
    tree_pattern = re.compile(r"tree_\d+\.root")

    jobId = 0
    root_files, remote_output, local_output = [], [], []

    found_sample_name = False
    sbatch_job_ids = []
    for s_key, s_value in samples.iteritems():
        sample_name = s_value['process_name_specific']
        if sample and sample_name != sample:
            continue
        found_sample_name = True

        sample_path = s_value['local_paths'][0]['path']
        logging.debug("Processing sample '{sample_name}'".format(
            sample_name=sample_name, ))

        output_dir_parent = os.path.join(output_dir, sample_name)
        if not os.path.isdir(output_dir_parent):
            os.makedirs(output_dir_parent)

        for sample_subdir_basename in os.listdir(sample_path):
            sample_subdir = os.path.join(sample_path, sample_subdir_basename)

            for rootfile_basename in os.listdir(sample_subdir):
                tree_match = tree_pattern.match(rootfile_basename)
                if not tree_match:
                    continue

                rootfile_idx = idx(rootfile_basename)
                root_files.append(
                    os.path.join(sample_subdir, rootfile_basename))
                local_output.append(
                    os.path.join(output_dir_parent,
                                 "{i}.txt".format(i=rootfile_idx)))
                remote_output.append(
                    os.path.join(scratch_dir, str(jobId), sample_name,
                                 os.path.basename(local_output[-1])))

                if len(root_files) == nof_files:
                    sh_path = os.path.join(output_dir_tmp_sh,
                                           "{i}.sh".format(i=jobId))
                    py_path = os.path.join(output_dir_tmp_py,
                                           "{i}.py".format(i=jobId))
                    log_path = os.path.join(output_dir_tmp_log,
                                            "{i}.log".format(i=jobId))
                    scratch_job_dir = os.path.join(scratch_dir, str(jobId))
                    sbatch_job_id = bake_job(
                        sh_path,
                        rle_branchNames,
                        treeName,
                        py_path,
                        scratch_job_dir,
                        zip(root_files, remote_output, local_output),
                        log_path,
                        not test,
                    )
                    if sbatch_job_id:
                        sbatch_job_ids.append(sbatch_job_id)
                    logging.debug("Creating job {jobId}".format(jobId=jobId))
                    root_files, remote_output, local_output = [], [], []
                    jobId += 1

    if sample and not found_sample_name:
        logging.error(
            "Sample name '{sample_name}' does not exist in the sample dictionary"
            .format(sample_name=sample))
        sys.exit(1)

    if root_files:
        sh_path = os.path.join(output_dir_tmp_sh, "{i}.sh".format(i=jobId))
        py_path = os.path.join(output_dir_tmp_py, "{i}.py".format(i=jobId))
        log_path = os.path.join(output_dir_tmp_log, "{i}.log".format(i=jobId))
        scratch_job_dir = os.path.join(scratch_dir, str(jobId))
        sbatch_job_id = bake_job(
            sh_path,
            rle_branchNames,
            treeName,
            py_path,
            scratch_job_dir,
            zip(root_files, remote_output, local_output),
            log_path,
            not test,
        )
        if sbatch_job_id:
            sbatch_job_ids.append(sbatch_job_id)
        logging.debug("Creating job {jobId}".format(jobId=jobId))

    logging.debug("Done!")
    return map(int, sbatch_job_ids)
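A usage sketch for dump_rle_parallel(), with hypothetical paths and assumed branch names (the actual run/lumi/event branch names depend on the Ntuple format); passing test=True would only generate the job scripts without submitting them to SLURM:

if __name__ == '__main__':
    job_ids = dump_rle_parallel(
        output_dir      = '/home/user/dump_rle_output',                  # hypothetical path
        rle_branchNames = {'run': 'run', 'lumi': 'lumi', 'evt': 'evt'},  # assumed branch names
        treeName        = 'tree',
        nof_files       = 100,
        force           = True,
        verbose         = True,
    )
    logging.info('Submitted %d job(s) to SLURM' % len(job_ids))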