Example #1
import array
import logging
import os

import ROOT

def dump_rle(input_file,
             output_file,
             tree_name='Events',
             run_br='run',
             lumi_br='luminosityBlock',
             event_br='event'):
    with open(output_file, 'w') as f:
        ch_root = ROOT.TChain(tree_name)
        ch_root.AddFile(input_file)

        run_a = array.array('I', [0])
        lumi_a = array.array('I', [0])
        evt_a = array.array('L', [0])

        ch_root.SetBranchAddress(run_br, run_a)
        ch_root.SetBranchAddress(lumi_br, lumi_a)
        ch_root.SetBranchAddress(event_br, evt_a)

        nof_entries = ch_root.GetEntries()
        rle_i_arr = []
        for i in range(nof_entries):
            ch_root.GetEntry(i)
            rle_i_arr.append(':'.join(map(str,
                                          [run_a[0], lumi_a[0], evt_a[0]])))

        f.write("{rle_lines}\n".format(rle_lines='\n'.join(rle_i_arr)))

    logging.debug("Wrote {nof_bytes} kB to {filename}".format(
        nof_bytes=os.path.getsize(output_file) / 1000,
        filename=output_file,
    ))
    return
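
A minimal usage sketch; the file names are hypothetical, and the defaults assume a NanoAOD-style 'Events' tree with 'run', 'luminosityBlock' and 'event' branches:

dump_rle(
    input_file  = 'nano.root',  # hypothetical input file
    output_file = 'rle.txt',    # receives one run:lumi:event line per event
)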
Example #2
    def memJobList(self, inputFileList, rle_whitelist):
        '''
        Args:
          inputFileList: { int : list of str }; i.e. a map of fileset* IDs to the corresponding lists of files

        * if the script were to generate configuration files, this number would correspond to job ID

        Returns:
          { int : { str : int, str : [str, str, ...], str : [int, int] } }
            |        |          |                      |
         job id  "fileset_id" "input_fileset"     "event_range"
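          e.g. { 1 : { 'fileset_id' : 0, 'input_fileset' : [ 'a.root' ], 'event_range' : [ 0, 5000 ], ... } }
          (the values shown are hypothetical, for illustration only)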

        The function reads the given set of files and determines the event ranges such that
        the number of MEM integrations per job does not exceed self.mem_integrations_per_job.
        '''
        memJobDict = {}
        jobId = 0
        apply_rle_filter = bool(self.rle_filter_file)
        for filesetId, inputFileSet in inputFileList.iteritems():
            memJobDict_common = { 'fileset_id' : filesetId, 'input_fileset' : inputFileSet }
            print("Processing file %s" % inputFileSet)
            ch = ROOT.TChain(self.treeName)
            for fn in inputFileSet:
                # chaining a file
                logging.debug("Processing file {fileName}".format(fileName = fn))
                ch.AddFile(fn)

            nof_entries = ch.GetEntries()

            memJobDict_common['nof_entries'] = nof_entries
            if nof_entries == 0:
                jobId += 1
                memJobDict[jobId] = dict({
                    'event_range'     : [0, 0],
                    'nof_int'         : 0,
                    'nof_int_pass'    : 0,
                    'nof_events_pass' : 0,
                    'nof_zero'        : 0,
                }, **memJobDict_common)
                continue

            current_pos = 0
            evt_ranges = []

            counter, counter_arr = 0, []
            nof_events_pass_counter, nof_events_pass   = 0, []
            nof_int_pass_counter,    nof_int_pass      = 0, []
            nof_zero_integrations,   nof_events_zero   = 0, []
            whitelist_all,           whitelist_running = [], []

            run                    = array.array('I', [0])
            luminosityBlock        = array.array('I', [0])
            event                  = array.array('L', [0])
            maxPermutations_addMEM = array.array('i', [0])
            ch.SetBranchAddress("run",             run)
            ch.SetBranchAddress("luminosityBlock", luminosityBlock)
            ch.SetBranchAddress("event",           event)
            if self.maxPermutations_branchName is not None and self.maxPermutations_branchName != "":
                ch.SetBranchAddress(self.maxPermutations_branchName, maxPermutations_addMEM)
            else:
                maxPermutations_addMEM[0] = 1

            for i in range(nof_entries):
                ch.GetEntry(i)
                if i > 0 and i % 10000 == 0:
                    logging.debug("Processing event %i/%i" % (i, nof_entries))

                rle = ':'.join(map(lambda nr: str(nr[0]), [ run, luminosityBlock, event ]))

                nof_integrations = maxPermutations_addMEM[0]
                if apply_rle_filter:
                    if rle in rle_whitelist:
                        if not (nof_integrations > 0):
                            logging.error("Expected non-zero # integrations in event {}, but got {}".format(rle, nof_integrations))
                        nof_integrations = 1
                    else:
                        nof_integrations = 0

                if nof_integrations < 0:
                    nof_integrations = 0

                if nof_integrations >= 1:
                    nof_events_pass_counter += 1
                    nof_int_pass_counter += nof_integrations
                else:
                    nof_zero_integrations += 1

                if nof_integrations > self.mem_integrations_per_job:
                    raise ValueError("Too many nof_integrations = %d in file(s) %s at %d:%d:%d" %
                                     (nof_integrations, ', '.join(inputFileSet), ch.run, ch.lumi, ch.evt))

                # adding this event would exceed the per-job budget: close the current
                # event range at current_pos (exclusive) and reset the running counters
                if (counter + nof_integrations) > self.mem_integrations_per_job:
                    if evt_ranges:
                        evt_ranges.append([evt_ranges[-1][1], current_pos])
                    else:
                        evt_ranges.append([0, current_pos])
                    counter_arr.append(counter)
                    counter = 0

                    nof_events_pass.append(nof_events_pass_counter)
                    nof_events_pass_counter = 0

                    nof_int_pass.append(nof_int_pass_counter)
                    nof_int_pass_counter = 0

                    nof_events_zero.append(nof_zero_integrations)
                    nof_zero_integrations = 0

                    if apply_rle_filter:
                        whitelist_all.append(whitelist_running)
                        whitelist_running = []

                if apply_rle_filter and rle in rle_whitelist:
                    whitelist_running.append(rle)

                counter += nof_integrations
                current_pos += 1

            # flush the last, possibly partial, chunk
            if 0 <= counter <= self.mem_integrations_per_job:
                if evt_ranges:
                    evt_ranges.append([evt_ranges[-1][1], int(nof_entries)])
                else:
                    evt_ranges.append([0, int(nof_entries)])
                counter_arr.append(counter)
                nof_events_pass.append(nof_events_pass_counter)
                nof_int_pass.append(nof_int_pass_counter)
                nof_events_zero.append(nof_zero_integrations)
                if apply_rle_filter:
                    whitelist_all.append(whitelist_running)

            # ensure that the event ranges won't overlap (i.e. there won't be any double-processing of any event)
            evt_ranges_cat = []
            for v in [range(x[0], x[1]) for x in evt_ranges]:
                evt_ranges_cat += v
            assert(evt_ranges_cat == range(nof_entries))
            assert(bool(evt_ranges))

            # we now have all event ranges for this fileset; add them to the dictionary
            for i in range(len(evt_ranges)):
                if self.max_jobs_per_sample == -1 or jobId < self.max_jobs_per_sample:
                    jobId += 1
                    memJobDict[jobId] = dict({
                        'event_range'     : evt_ranges[i],
                        'nof_int'         : counter_arr[i],
                        'nof_int_pass'    : nof_int_pass[i],
                        'nof_events_pass' : nof_events_pass[i],
                        'nof_zero'        : nof_events_zero[i],
                        'whitelist'       : whitelist_all[i] if apply_rle_filter else [],
                    }, **memJobDict_common)

            del ch
        return memJobDict
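
The chunk-boundary bookkeeping above is easier to follow in isolation. The sketch below reproduces the same greedy splitting over a toy list of per-event integration counts; split_by_budget is a hypothetical stand-alone helper, not part of the class:

def split_by_budget(weights, budget):
    # greedily split event indices into contiguous, half-open [start, stop)
    # ranges such that the summed weight of a range never exceeds the budget
    # (mirrors the counter / evt_ranges logic of memJobList)
    ranges, start, total = [], 0, 0
    for pos, w in enumerate(weights):
        if total + w > budget:
            ranges.append([start, pos])
            start, total = pos, 0
        total += w
    ranges.append([start, len(weights)])
    return ranges

# split_by_budget([3, 1, 0, 4, 2, 2], budget = 5) -> [[0, 3], [3, 4], [4, 6]]

As in memJobList, a single event whose own weight exceeds the budget cannot be placed in any job, which is exactly the situation the ValueError above guards against.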
Example #3
                        sample_path, '000%d' % (file_idx / 1000),
                        'tree_{i}.root'.format(i=file_idx))
                    rles[rle].append(grep_result)
        else:
            # instead of forming a list of files, loop over the subfolders and the files therein
            logging.debug('Looping over the files in {sample_path}'.format(
                sample_path=sample_path))
            for subdir in hdfs.listdir(sample_path):
                logging.debug(
                    'Found subdirectory {subdir}'.format(subdir=subdir))
                for rootfile in hdfs.listdir(subdir):
                    logging.debug("Processing file '{rootfile}'".format(
                        rootfile=rootfile, ))

                    # open the file
                    ch_root = ROOT.TChain("Events")
                    ch_root.AddFile(rootfile)

                    run_a = array.array('I', [0])
                    lumi_a = array.array('I', [0])
                    evt_a = array.array('L', [0])

                    ch_root.SetBranchAddress("run", run_a)
                    ch_root.SetBranchAddress("luminosityBlock", lumi_a)
                    ch_root.SetBranchAddress("event", evt_a)

                    nof_entries = ch_root.GetEntries()
                    for i in range(nof_entries):
                        ch_root.GetEntry(i)
                        rle_i = ':'.join(
                            map(str, [run_a[0], lumi_a[0], evt_a[0]]))
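
The run:lumi:event strings built this way serve as lookup keys (cf. the rles dictionary above). For completeness, a sketch of the inverse operation; parse_rle is a hypothetical helper, not part of the original module:

def parse_rle(rle):
    # split a 'run:lumi:event' string back into a tuple of integers
    run, lumi, evt = rle.split(':')
    return int(run), int(lumi), int(evt)

# parse_rle('1:2093:521964560') -> (1, 2093, 521964560)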
Example #4
    def memJobList(self, inputFileList):
        '''
        Args:
          inputFileList: { int : list of str }; i.e. a map of fileset* IDs to the corresponding lists of files

        * if the script were to generate configuration files, this number would correspond to job ID

        Returns:
          { int : { str : int, str : [str, str, ...], str : [int, int] } }
            |        |          |                      |
         job id  "fileset_id" "input_fileset"     "event_range"
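          e.g. { 1 : { 'fileset_id' : 0, 'input_fileset' : [ 'a.root' ], 'event_range' : [ 0, 5000 ], ... } }
          (the values shown are hypothetical, for illustration only)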

        The function reads the given set of files and determines the event ranges such that
        the number of MEM integrations per job does not exceed self.mem_integrations_per_job.
        '''
        memJobDict = {}
        jobId = 0
        for filesetId, inputFileSet in inputFileList.iteritems():
            memJobDict_common = {
                'fileset_id': filesetId,
                'input_fileset': inputFileSet
            }
            ch = ROOT.TChain(self.treeName)
            for fn in inputFileSet:
                # chaining a file
                logging.debug("Processing file {fileName}".format(fileName=fn))
                ch.AddFile(fn)

            nof_entries = ch.GetEntries()

            memJobDict_common['nof_entries'] = nof_entries
            if nof_entries == 0:
                jobId += 1
                memJobDict[jobId] = dict(
                    {
                        'event_range': [0, 0],
                        'nof_int': 0,
                        'nof_int_pass': 0,
                        'nof_events_pass': 0,
                        'nof_zero': 0,
                    }, **memJobDict_common)
                continue

            current_pos = 0
            evt_ranges = []

            counter, counter_arr = 0, []
            nof_events_pass_counter, nof_events_pass = 0, []
            nof_int_pass_counter, nof_int_pass = 0, []
            nof_zero_integrations, nof_events_zero = 0, []

            maxPermutations_addMEM = array.array('i', [0])
            ch.SetBranchAddress(self.maxPermutations_branchName,
                                maxPermutations_addMEM)

            for i in range(nof_entries):
                ch.GetEntry(i)
                if i > 0 and i % 10000 == 0:
                    logging.debug("Processing event %i/%i" % (i, nof_entries))

                nof_integrations = maxPermutations_addMEM[0]
                if nof_integrations < 0:
                    nof_integrations = 0

                if nof_integrations >= 1:
                    nof_events_pass_counter += 1
                    nof_int_pass_counter += nof_integrations
                else:
                    nof_zero_integrations += 1

                if nof_integrations > self.mem_integrations_per_job:
                    raise ValueError(
                        "Too many nof_integrations = %d in file(s) %s at %d:%d:%d"
                        % (nof_integrations, ', '.join(inputFileSet), ch.run,
                           ch.luminosityBlock, ch.event))

                # budget exceeded: close the current event range and reset the counters
                if (counter + nof_integrations) > self.mem_integrations_per_job:
                    if evt_ranges:
                        evt_ranges.append([evt_ranges[-1][1], current_pos])
                    else:
                        evt_ranges.append([0, current_pos])
                    counter_arr.append(counter)
                    counter = 0

                    nof_events_pass.append(nof_events_pass_counter)
                    nof_events_pass_counter = 0

                    nof_int_pass.append(nof_int_pass_counter)
                    nof_int_pass_counter = 0

                    nof_events_zero.append(nof_zero_integrations)
                    nof_zero_integrations = 0
                counter += nof_integrations
                current_pos += 1

            # flush the last, possibly partial, chunk
            if 0 <= counter <= self.mem_integrations_per_job:
                if evt_ranges:
                    evt_ranges.append([evt_ranges[-1][1], int(nof_entries)])
                else:
                    evt_ranges.append([0, int(nof_entries)])
                counter_arr.append(counter)
                nof_events_pass.append(nof_events_pass_counter)
                nof_int_pass.append(nof_int_pass_counter)
                nof_events_zero.append(nof_zero_integrations)

            # ensure that the event ranges won't overlap (i.e. there won't be any double-processing of any event)
            evt_ranges_cat = []
            for v in [range(x[0], x[1]) for x in evt_ranges]:
                evt_ranges_cat += v
            assert (evt_ranges_cat == range(nof_entries))
            assert (bool(evt_ranges))

            # we now have all event ranges for this fileset; add them to the dictionary
            for i in range(len(evt_ranges)):
                jobId += 1
                memJobDict[jobId] = dict(
                    {
                        'event_range': evt_ranges[i],
                        'nof_int': counter_arr[i],
                        'nof_int_pass': nof_int_pass[i],
                        'nof_events_pass': nof_events_pass[i],
                        'nof_zero': nof_events_zero[i],
                    }, **memJobDict_common)

        return memJobDict
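
A sketch of a possible call site; cfg stands for a hypothetical instance of the surrounding class and input_file_list for a { fileset ID : file list } map, neither of which appears in the snippet above:

mem_job_dict = cfg.memJobList(input_file_list)
for job_id in sorted(mem_job_dict):
    job = mem_job_dict[job_id]
    start, stop = job['event_range']  # half-open range of tree entries
    logging.debug("job #%d: fileset %d, events [%d, %d), %d integration(s)" %
                  (job_id, job['fileset_id'], start, stop, job['nof_int']))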