コード例 #1
0
ファイル: skim_root.py プロジェクト: saswatinandan/tth-htt
def skim(in_filename, out_filename, entry_list, tree_name = "Events"):
  '''Copies the listed tree entries from one ROOT file into a new one.

  Args:
    in_filename:  string,    Path to the input ROOT file
    out_filename: string,    Path to the output ROOT file
    entry_list:   int array, List of entries (not RLE numbers!) which are used to select the event
    tree_name:    string,    TTree name (default: Events)

  Returns:
    True,  if every entry number was within the actual number of entries in the ROOT file
    False, otherwise
  '''
  logging.debug("Processing file {in_filename}".format(in_filename = in_filename))
  f_in = ROOT.TFile(in_filename, "read")
  t_in = f_in.Get(tree_name)

  nof_entries = t_in.GetEntries()
  # Entries are 0-based, so the largest valid entry number is (nof_entries - 1);
  # the previous '>' comparison let the off-by-one case max == nof_entries slip
  # through.  Also guard against an empty entry_list, for which max() raises.
  if entry_list and max(entry_list) >= nof_entries:
    logging.error("Max entry ID exceeds the number of entries in {root_filename}: {max_entry} >= {nof_entries}".format(
      root_filename = in_filename,
      max_entry     = max(entry_list),
      nof_entries   = nof_entries,
    ))
    return False

  f_out = ROOT.TFile(out_filename, "recreate")
  t_out = t_in.CloneTree(0)  # empty clone with the same branch structure

  for i in entry_list:
    t_in.GetEntry(i)  # loads entry i into the branch buffers shared with t_out
    t_out.Fill()

  t_out.AutoSave()
  logging.debug("Saved events to {out_filename}".format(out_filename = out_filename))
  # Close both files to flush the output tree and release the file handles
  f_out.Close()
  f_in.Close()
  return True
コード例 #2
0
ファイル: skim_root.py プロジェクト: saswatinandan/tth-htt
def get_rle(in_filename, tree_name = "tree"):
  '''Collects the RLE identifier of every event stored in a ROOT file.

  Args:
    in_filename: string, Path to the ROOT file from which the RLE numbers are fetched
    tree_name:   string, TTree name (default: tree)

  Returns:
    string array, List of RLE numbers (in the format returned by join_rle())

  The function expects the input file to exist in the file system.
  '''
  root_file = ROOT.TFile(in_filename, "read")
  tree = root_file.Get(tree_name)

  # One-element arrays act as mutable buffers that ROOT refills on each GetEntry()
  run_buf  = array.array('I', [0])
  lumi_buf = array.array('I', [0])
  evt_buf  = array.array('L', [0])

  tree.SetBranchAddress("run", run_buf)
  tree.SetBranchAddress("luminosityBlock", lumi_buf)
  tree.SetBranchAddress("event", evt_buf)

  rles = []
  for entry_idx in range(tree.GetEntries()):
    tree.GetEntry(entry_idx)
    rles.append(join_rle(run_buf, lumi_buf, evt_buf))

  return rles
コード例 #3
0
def check_that_histogram_is_not_zombie(input_file):
    """Exits the program if input_file is not a readable, non-zombie ROOT file.

    Args:
      input_file: string, Path to the ROOT file to check
    """
    print("<check_that_histogram_is_not_zombie>: input file = '%s'" %
          input_file)

    # TFile.Open() returns None when the file cannot be opened at all -- a case
    # the plain TFile constructor + IsZombie() combination failed to catch
    # (the original author flagged it with "THIS IS NOT WORKING").
    root_tfile = ROOT.TFile.Open(input_file, "read")

    if not root_tfile or root_tfile.IsZombie():
        print("ERROR: Input file '%s' is zombie !!" % input_file)
        sys.exit(1)
    root_tfile.Close()
コード例 #4
0
def is_file_ok(output_file_name, validate_outputs=True, min_file_size=20000):
    """Checks whether an output file exists and looks usable; deletes it otherwise.

    Args:
      output_file_name: string, Path to the output file (may be None or empty)
      validate_outputs: bool,   If True, open ROOT files to detect corruption
      min_file_size:    int,    Minimal acceptable size in bytes (<= 0 disables the check)

    Returns:
      True,  if the file exists and passes every requested check
      False, otherwise (an existing but failing ROOT file is deleted via run_cmd)
    """
    if not (output_file_name and os.path.exists(output_file_name)):
        return False

    logging.info("Output file %s already exists" % output_file_name)

    # Non-ROOT files are only checked for existence
    if not output_file_name.lower().endswith('.root'):
        return True

    command = "rm %s" % output_file_name
    ret_value = False
    if min_file_size > 0:
        output_file_size = os.stat(output_file_name).st_size
        if output_file_size > min_file_size:
            if not validate_outputs:
                ret_value = True
        else:
            logging.info(
                "Deleting output file and resubmitting job because it has size smaller than %d bytes"
                % min_file_size)
    elif not validate_outputs:
        # Bug fix: with the size check disabled and no validation requested
        # there is nothing left to test, so the existing file must be accepted
        # -- previously it was deleted unconditionally on this path.
        ret_value = True

    if validate_outputs:
        root_tfile = ROOT.TFile(output_file_name, "read")
        if not root_tfile:
            logging.info("Not a valid ROOT file, deleting it")
        else:
            if root_tfile.IsZombie():
                logging.info(
                    "Output file is corrupted, deleting file and resubmitting job"
                )
            else:
                # Let's open the file via bash as well to see if ROOT tries to recover the file
                open_cmd = "root -b -l -q %s 2>&1 > /dev/null | grep 'trying to recover' | wc -l" % output_file_name
                open_out = run_cmd(open_cmd)
                if open_out.rstrip('\n') != '0':
                    logging.info(
                        "Output file is probably corrupted, deleting file and resubmitting job"
                    )
                else:
                    ret_value = True
            root_tfile.Close()

    if not ret_value:
        run_cmd(command)

    return ret_value
コード例 #5
0
def validate(output_dir, verbose=False):
    '''Validates the job execution carried out by dump_rle_parallel().

    Args:
      output_dir: string, The directory where all RLE files are stored
      verbose:    bool,   Enable verbose (DEBUG-level) output

    Returns:
      None

    The validation is quite basic: the function loops over the samples listed in
    the module-level `samples` dictionary, locates each sample's directory under
    output_dir, and compares the line count of every RLE text file against the
    number of entries of the corresponding ROOT file. Discrepancies are logged
    and tallied per category ('excess', 'missing', 'corrupted').

    In principle, the script could also print relevant commands to fix the issues
    (and dump them to an easily executable file) but let's leave it for another time.
    '''

    if verbose:
        logging.getLogger().setLevel(logging.DEBUG)

    # NOTE(review): not a raw string and the '.' before 'root' is unescaped, so
    # it matches any character there; consider r'^tree_(\d+)\.root$'
    root_file_regex = re.compile('^tree_(\d+).root$')
    # Maps problem category -> list of (rle_file, root_file) pairs
    file_dict = {k: [] for k in ['excess', 'missing', 'corrupted']}

    try:

        # `samples` is a module-level dict; .iteritems() implies Python 2
        for s_key, s_value in samples.iteritems():
            sample_name = s_value['process_name_specific']
            sample_dir = os.path.join(output_dir, sample_name)
            if os.path.isdir(sample_dir):
                logging.debug("Found sample directory {sample_dir}".format(
                    sample_dir=sample_dir))

                #NB! assume that there are no secondary paths in the dictionary (hence index 0!)
                sample_path_dict = s_value['local_paths'][0]
                sample_path = sample_path_dict['path']
                blacklist = sample_path_dict['blacklist']
                # Only numeric subdirectories are processed; anything else is skipped
                for sample_subdir in os.listdir(sample_path):
                    sample_subpath_idx = -1
                    try:
                        sample_subpath_idx = int(sample_subdir)
                    except ValueError:
                        continue
                    if sample_subpath_idx < 0:
                        raise ValueError("Internal error")
                    sample_subpath = os.path.join(sample_path, sample_subdir)
                    logging.debug(
                        "Processing sample subdirectory {sample_subpath}".
                        format(sample_subpath=sample_subpath))

                    for sample_file in os.listdir(sample_subpath):
                        sample_file_fullpath = os.path.join(
                            sample_subpath, sample_file)
                        # Consider only regular files named like tree_<N>.root
                        if not sample_file.endswith(
                                '.root') or not os.path.isfile(
                                    sample_file_fullpath):
                            continue

                        root_file_regex_match = root_file_regex.search(
                            sample_file)
                        if not root_file_regex_match:
                            continue

                        # tree_<N>.root is expected to pair with <N>.txt in sample_dir
                        root_file_idx = int(root_file_regex_match.group(1))
                        expected_rle_file_basename = '{root_file_idx}.txt'.format(
                            root_file_idx=root_file_idx)
                        expected_rle_file = os.path.join(
                            sample_dir, expected_rle_file_basename)
                        file_dict_entry = (expected_rle_file,
                                           sample_file_fullpath)
                        # Blacklisted ROOT files must not have an RLE file at all
                        if root_file_idx in blacklist:
                            if os.path.isfile(expected_rle_file):
                                logging.warning(
                                    'Found RLE file {rle_file} (corresponding to blacklisted {root_file}) '
                                    'which you ought to delete'.format(
                                        rle_file=expected_rle_file,
                                        root_file=sample_file_fullpath,
                                    ))
                            file_dict['excess'].append(file_dict_entry)
                            continue

                        if not os.path.isfile(expected_rle_file):
                            logging.warning(
                                'Missing RLE file {rle_file} (corresponding to {root_file})'
                                .format(
                                    rle_file=expected_rle_file,
                                    root_file=sample_file_fullpath,
                                ))
                            file_dict['missing'].append(file_dict_entry)
                            continue
                        nof_rle_events = raw_linecount(expected_rle_file)
                        if nof_rle_events == 1 and os.path.getsize(
                                expected_rle_file) == 1:
                            # the RLE file contains only a newline, hence no events
                            nof_rle_events = 0

                        # Compare the RLE line count with the event count of the ROOT file
                        root_file = ROOT.TFile(sample_file_fullpath, 'read')
                        root_tree = root_file.Get('tree')
                        nof_entries = root_tree.GetEntries()

                        nof_events_diff = nof_rle_events - nof_entries
                        if nof_events_diff < 0:
                            logging.error(
                                'Missing {nof_events} events in {rle_filename} (corresponding to {sample_file}): '
                                'expected {expected}, got {actual}'.format(
                                    nof_events=abs(nof_events_diff),
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        elif nof_events_diff > 0:
                            logging.error(
                                'Got {nof_events} more event than expected in {rle_filename} (corresponding '
                                'to {sample_file}): expected {expected}, got {actual}'
                                .format(
                                    nof_events=nof_events_diff,
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                    expected=nof_entries,
                                    actual=nof_rle_events,
                                ))
                            file_dict['corrupted'].append(file_dict_entry)
                        else:
                            logging.debug(
                                'File {rle_filename} (corresponding to {sample_file}) looks OK'
                                .format(
                                    rle_filename=expected_rle_file,
                                    sample_file=sample_file_fullpath,
                                ))

    except KeyboardInterrupt:
        # Allow the user to abort a long validation run without a traceback
        pass

    # Summarize the findings per category
    if any(map(bool, file_dict.values())):
        logging.info('Validation finished with errors')
        for key in file_dict.keys():
            if file_dict[key]:
                logging.info('Number of {key} RLE files: {nof_key}'.format(
                    key=key, nof_key=len(file_dict[key])))
                for entry in file_dict[key]:
                    logging.info('{rle_file} <=> {sample_file}'.format(
                        rle_file=entry[0], sample_file=entry[1]))
    else:
        logging.info('Validation finished successfully')
    return
コード例 #6
0
def build_rle_file(rles, output):
    '''Aggregates RLE numbers per channel & sample and writes them to a ROOT file.

    Args:
      rles:   dict, Nested mapping: channel -> region -> sample name ->
              central_or_shift -> list of 'run:lumi:event' strings
      output: string, Path to the output ROOT file (recreated)

    Returns:
      None

    For every channel a TDirectory is created; for every sample a TTree with
    run/luminosityBlock/event branches plus one flag branch per
    '<region>_<central_or_shift>' combination marking where the event was selected.
    '''
    rles_aggregated = collections.OrderedDict()
    region_plus_sys_arr = collections.OrderedDict()
    for channel in rles:
        rles_aggregated[channel] = collections.OrderedDict()
        region_plus_sys_arr[channel] = []
        for region in rles[channel]:
            for sample_name in rles[channel][region]:
                if sample_name not in rles_aggregated[channel]:
                    rles_aggregated[channel][
                        sample_name] = collections.OrderedDict()
                for central_or_shift in rles[channel][region][sample_name]:
                    region_plus_sys = region + '_' + central_or_shift
                    # Bug fix: membership must be tested against the per-channel
                    # list, not the outer dict (whose keys are channel names) --
                    # the old test was always true and appended duplicates,
                    # which later produced duplicate TTree branch names
                    if region_plus_sys not in region_plus_sys_arr[channel]:
                        region_plus_sys_arr[channel].append(region_plus_sys)
                    for rle in rles[channel][region][sample_name][
                            central_or_shift]:
                        assert (REGEX_RLE.match(rle))
                        if rle not in rles_aggregated[channel][sample_name]:
                            rles_aggregated[channel][sample_name][rle] = []
                        rles_aggregated[channel][sample_name][rle].append(
                            region_plus_sys)
    output_file = ROOT.TFile(output, 'recreate')
    for channel in rles_aggregated:
        channel_dir = output_file.mkdir(channel)
        channel_dir.cd()
        event_count = 0
        for sample_name in rles_aggregated[channel]:
            tree = ROOT.TTree(sample_name, sample_name)
            # One-element arrays serve as the branch buffers
            run = array.array('I', [0])
            luminosityBlock = array.array('I', [0])
            event = array.array('L', [0])
            region_plus_sys_brs = collections.OrderedDict()
            for region_plus_sys in region_plus_sys_arr[channel]:
                region_plus_sys_brs[region_plus_sys] = array.array('I', [0])
            tree.Branch('run', run, 'run/i')
            tree.Branch('luminosityBlock', luminosityBlock,
                        'luminosityBlock/i')
            tree.Branch('event', event, 'event/l')
            for region_plus_sys in region_plus_sys_brs:
                tree.Branch(region_plus_sys,
                            region_plus_sys_brs[region_plus_sys],
                            '%s/i' % region_plus_sys)
            event_count_sample = len(rles_aggregated[channel][sample_name])
            logging.info(
                'Found a total of {} events in sample {} and channel {}'.
                format(event_count_sample, sample_name, channel))
            event_count += event_count_sample
            for rle in rles_aggregated[channel][sample_name]:
                rle_split = rle.split(':')
                run[0] = int(rle_split[0])
                luminosityBlock[0] = int(rle_split[1])
                event[0] = int(rle_split[2])
                for region_plus_sys in region_plus_sys_brs:
                    # 1 if the event was selected in this region+systematic, else 0
                    region_plus_sys_brs[region_plus_sys][0] = int(
                        region_plus_sys in rles_aggregated[channel]
                        [sample_name][rle])
                tree.Fill()
            # Bug fix: write only the freshly filled tree; output_file.Write()
            # here re-wrote every in-memory object once per sample, piling up
            # extra cycles of earlier trees
            tree.Write()
        logging.info('Found a total of {} events in channel {}'.format(
            event_count, channel))
    # Bug fix: the 'Wrote file' message was emitted once per channel; log it
    # once and close the file to flush everything to disk
    logging.info('Wrote file {}'.format(output))
    output_file.Close()