Example #1
    def process(self, events):
        select_output = self.select(events, unc="nominal", shift=None)
        categories = self.categories(select_output)
        output = select_output["output"]

        if categories:
            arrays = self.arrays(ChainMap({}, select_output))
            dataset = self.get_dataset(events)

            # column 0: the dataset id, column 1: the per-event weight
            weight = select_output["weights"].weight()
            arrays.setdefault(
                "weight",
                np.stack([np.full_like(weight, dataset.id), weight], axis=-1))

            assert all(not a.dtype.hasobject for a in arrays.values())

            if self.dtype:
                arrays = {
                    key: array.astype(self.dtype)
                    for key, array in arrays.items()
                }
        else:
            arrays = {}

        output["arrays"] = dict_accumulator({
            category: dict_accumulator({
                key: array_accumulator(array[cut, ...])
                for key, array in arrays.items()
            })
            for category, cut in categories.items()
        })

        return output
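The snippet above leans on coffea's accumulator semantics: executors combine per-chunk outputs with +=. Below is a minimal sketch of that merge behavior using only accumulators from coffea.processor (array_accumulator, seen above, appears to be a user-defined wrapper rather than part of coffea itself):

from coffea import processor

a = processor.dict_accumulator({"sumw": processor.defaultdict_accumulator(float)})
b = a.identity()  # an empty accumulator of the same shape
a["sumw"]["ttbar"] += 1.0
b["sumw"]["ttbar"] += 2.0
a += b  # this is how executors fold chunk outputs together
assert a["sumw"]["ttbar"] == 3.0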
Example #2
def _work_function_nanoaod(item, processor_instance, flatten=False, savemetrics=False, mmap=False):
    # 'heavy' means the processor was shipped via the executor's heavy_input
    # mechanism (see run_uproot_job_nanoaod below) and arrives bundled with the item
    if processor_instance == 'heavy':
        item, processor_instance = item
    if not isinstance(processor_instance, ProcessorABC):
        processor_instance = cloudpickle.loads(lz4f.decompress(processor_instance))
    if mmap:
        localsource = {}
    else:
        opts = dict(uproot.FileSource.defaults)
        opts.update({'parallel': None})

        def localsource(path):
            return uproot.FileSource(path, **opts)

    file = uproot.open(item.filename, localsource=localsource)
    tree = file[item.treename]
    df = LazyDataFrame(tree, item.chunksize, item.index, flatten=flatten)
    # For NanoAOD, we have to look at the "Runs" TTree for info such as weight sums
    # The different cases in the loop represent the different formats and accordingly
    # different ways of dealing with the provided values.
    for name in map(lambda x: x.decode('utf-8'), file['Runs'].keys()):
        arr = file['Runs'][name].array()
        if name.startswith('n'):
            # Check that all entries agree, then save that single value
            values = set(arr)
            assert len(values) == 1
            df[name] = values.pop()
        elif name in ['genEventCount','genEventSumw','genEventSumw2']:
            # One entry per run -> just sum
            df[name] = int(item.index==0) * arr.sum()
        elif name in ['LHEScaleSumw','LHEPdfSumw']:
            # Sum per variation, conserve number of variations
            tmp = 0 * arr[0]
            for i in range(len(arr)):
                for j in range(len(arr[i])):
                    tmp[j] += arr[i][j]
            df[name] = int(item.index==0) * tmp
    ### END NANOAOD
    df['dataset'] = item.dataset
    tic = time.time()
    out = processor_instance.process(df)
    toc = time.time()
    metrics = dict_accumulator()
    if savemetrics:
        if isinstance(file.source, uproot.source.xrootd.XRootDSource):
            metrics['bytesread'] = value_accumulator(int, file.source.bytesread)
            metrics['dataservers'] = set_accumulator({file.source._source.get_property('DataServer')})
        metrics['columns'] = set_accumulator(df.materialized)
        metrics['entries'] = value_accumulator(int, df.size)
        metrics['processtime'] = value_accumulator(float, toc - tic)
    wrapped_out = dict_accumulator({'out': out, 'metrics': metrics})
    file.source.close()
    return wrapped_out
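The int(item.index == 0) factor above is what keeps file-level quantities from being double counted: sums like genEventSumw exist once per file, but each file is processed as several chunks whose outputs are added together. A toy illustration with plain numpy:

import numpy as np

genEventSumw = np.array([10.0, 5.0])  # one entry per run in this file
# credit the file-level sum only to chunk 0
chunk_contributions = [int(index == 0) * genEventSumw.sum() for index in range(3)]
assert sum(chunk_contributions) == 15.0  # counted exactly once overall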
Example #3
    def __init__(self):
        self._acc = dict_accumulator()
        self._acc['sumw'] = processor.defaultdict_accumulator(float)
        self._acc['sumw_scale'] = processor.defaultdict_accumulator(
            empty_array_100)
        self._acc['sumw_pdf'] = processor.defaultdict_accumulator(
            empty_array_100)
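empty_array_100 is not shown in this snippet; defaultdict_accumulator expects a zero-argument callable producing the identity value, so a plausible (hypothetical) definition would be:

import numpy as np

def empty_array_100():
    # hypothetical helper: identity for accumulating length-100 weight-sum arrays
    return np.zeros(100)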
Example #4
    def __init__(self, task):
        super().__init__(task)

        self._accumulator["histograms"] = dict_accumulator({
            variable.name: hist_accumulator(
                Hist(
                    self.dataset_axis,
                    self.category_axis,
                    self.syst_axis,
                    hist.axis.Regular(
                        variable.binning[0],
                        variable.binning[1],
                        variable.binning[2],
                        name=variable.name,
                        label=variable.x_title,
                    ),
                    metadata={
                        "name": variable.name,
                        "x_title": variable.x_title,
                        "y_title": variable.y_title,
                    },
                    storage=hist.storage.Weight(),
                ))
            for variable in self.variables
        })
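hist_accumulator here is user code wrapping a histogram from the scikit-hep hist package; the Hist construction itself follows the standard hist API. A small, self-contained sketch of building and filling a histogram of this shape:

import hist
from hist import Hist

h = Hist(
    hist.axis.StrCategory([], name="dataset", growth=True),
    hist.axis.Regular(40, 0.0, 200.0, name="jet_pt", label="Jet pT [GeV]"),
    storage=hist.storage.Weight(),
)
h.fill(dataset="ttbar", jet_pt=[35.0, 120.0], weight=[1.0, 0.8])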
Example #5
    def process(self, events):
        from distributed import worker_client, Variable, Lock

        assert isinstance(self.proc, BaseProcessor)
        assert not isinstance(self.proc, _Preheater)

        s = self.proc.get_dataset(events).data_source
        d = self.prefix + s

        with worker_client(separate_thread=False) as c:
            v = Variable(d, c)
            lock = Lock(d, c)

            if lock.acquire(blocking=False):
                self.proc.process(events)

                cols = set()
                for col in events.materialized:
                    col = col.replace("_", ".", 1)
                    try:
                        attrgetter(col)(events)
                    except AttributeError:
                        pass
                    else:
                        cols.add(col)
                cols = sorted(cols)
                v.set(cols)
                return dict_accumulator({s: set_accumulator(cols)})
            else:
                cols = v.get()

        for ag in map(attrgetter, cols):
            data = ag(events)
            data = getattr(data, "content", data)
            # use a default so columns without a materialize method don't raise
            if callable(getattr(data, "materialize", None)):
                data.materialize()
        return dict_accumulator({})
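The col.replace("_", ".", 1) step maps flat NanoAOD branch names onto NanoEvents attribute paths, which attrgetter can then resolve, e.g.:

from operator import attrgetter

col = "Jet_pt".replace("_", ".", 1)  # -> "Jet.pt"
get_col = attrgetter(col)  # get_col(events) is then equivalent to events.Jet.pt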
Example #6
def _work_function_nanoaod(item, flatten=False, savemetrics=False, mmap=False, **_):
    dataset, fn, treename, chunksize, index, processor_instance = item
    if mmap:
        localsource = {}
    else:
        opts = dict(uproot.FileSource.defaults)
        opts.update({'parallel': None})

        def localsource(path):
            return uproot.FileSource(path, **opts)

    file = uproot.open(fn, localsource=localsource)

    tree = file[treename]
    df = LazyDataFrame(tree, chunksize, index, flatten=flatten)
    # Credit file-level "Runs" values (e.g. weight sums) only to the first
    # chunk, so they are not double counted when chunk outputs are summed
    for name in file['Runs'].keys():
        name = name.decode('utf-8')
        if index == 0:
            df[name] = file['Runs'][name].array()
        else:
            df[name] = 0 * file['Runs'][name].array()
    df['dataset'] = dataset
    tic = time.time()
    out = processor_instance.process(df)
    toc = time.time()
    metrics = dict_accumulator()
    if savemetrics:
        if isinstance(file.source, uproot.source.xrootd.XRootDSource):
            metrics['bytesread'] = value_accumulator(int, file.source.bytesread)
            metrics['dataservers'] = set_accumulator({file.source._source.get_property('DataServer')})
        metrics['columns'] = set_accumulator(df.materialized)
        metrics['entries'] = value_accumulator(int, df.size)
        metrics['processtime'] = value_accumulator(float, toc - tic)
    wrapped_out = dict_accumulator({'out': out, 'metrics': metrics})
    file.source.close()
    return wrapped_out
Example #7
    def __init__(self, task):
        self.publish_message = task.publish_message if task.debug else None
        self.config = task.config_inst
        self.year = task.year
        self.corrections = task.load_corrections()

        self.dataset_axis = hist.axis.StrCategory([],
                                                  name="dataset",
                                                  label="Primary dataset",
                                                  growth=True)
        self.dataset_shift_axis = hist.axis.StrCategory([],
                                                        name="dataset_shift",
                                                        label="Dataset shift",
                                                        growth=True)
        self.category_axis = hist.axis.StrCategory([],
                                                   name="category",
                                                   label="Category selection",
                                                   growth=True)
        self.syst_axis = hist.axis.StrCategory(
            [],
            name="systematic",
            label="Shift of systematic uncertainty",
            growth=True)

        self._accumulator = dict_accumulator(
            n_events=defaultdict_accumulator(int),
            sum_gen_weights=defaultdict_accumulator(float),
            object_cutflow=defaultdict_accumulator(int),
            cutflow=hist_accumulator(
                Hist(
                    self.dataset_axis,
                    self.category_axis,
                    hist.axis.Regular(10,
                                      0,
                                      10,
                                      name="cutflow",
                                      label="Cut index"),
                    storage=hist.storage.Weight(),
                )),
        )
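With growth=True, a StrCategory axis accepts labels it has never seen at fill time, which is why datasets, categories, and systematic shifts don't have to be declared up front. A minimal sketch:

import hist

h = hist.Hist(hist.axis.StrCategory([], name="dataset", growth=True))
h.fill(dataset="ttbar")
h.fill(dataset="wjets")  # a new category is added automatically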
Example #8
def _work_function_nanoaod(item,
                           processor_instance,
                           flatten=False,
                           savemetrics=False,
                           mmap=False,
                           nano=False,
                           cachestrategy=None,
                           skipbadfiles=False,
                           retries=0,
                           xrootdtimeout=None):
    if processor_instance == 'heavy':
        item, processor_instance = item
    if not isinstance(processor_instance, ProcessorABC):
        processor_instance = cloudpickle.loads(
            lz4f.decompress(processor_instance))
    if mmap:
        localsource = {}
    else:
        opts = dict(uproot.FileSource.defaults)
        opts.update({'parallel': None})

        def localsource(path):
            return uproot.FileSource(path, **opts)

    import warnings
    out = processor_instance.accumulator.identity()
    retry_count = 0
    while retry_count <= retries:
        try:
            from uproot.source.xrootd import XRootDSource
            # copy the defaults so the shared class-level dict is not mutated
            xrootdsource = dict(XRootDSource.defaults)
            xrootdsource['timeout'] = xrootdtimeout
            file = uproot.open(item.filename,
                               localsource=localsource,
                               xrootdsource=xrootdsource)
            if nano:
                pass
                # cache = None
                # if cachestrategy == 'dask-worker':
                #     from distributed import get_worker
                #     from .dask import ColumnCache
                #     worker = get_worker()
                #     try:
                #         cache = worker.plugins[ColumnCache.name]
                #     except KeyError:
                #         # emit warning if not found?
                #         pass
                # df = NanoEvents.from_file(
                #     file=file,
                #     treename=item.treename,
                #     entrystart=item.entrystart,
                #     entrystop=item.entrystop,
                #     metadata={
                #         'dataset': item.dataset,
                #         'filename': item.filename
                #     },
                #     cache=cache,
                # )
            else:
                tree = file[item.treename]
                df = LazyDataFrame(tree,
                                   item.entrystart,
                                   item.entrystop,
                                   flatten=flatten)
                # For NanoAOD, we have to look at the "Runs" TTree for info such as weight sums
                # The different cases in the loop represent the different formats and accordingly
                # different ways of dealing with the provided values.
                for name in map(lambda x: x.decode('utf-8'),
                                file['Runs'].keys()):
                    arr = file['Runs'][name].array()
                    if name.startswith('n'):
                        # Check that all instances are the same, then save that value
                        values = set(arr)
                        assert len(values) == 1
                        df[name] = values.pop()
                    elif name in [
                            'genEventCount', 'genEventSumw', 'genEventSumw2'
                    ]:
                        # One entry per run -> just sum
                        df[name] = int(item.entrystart == 0) * arr.sum()
                    elif name in ['LHEScaleSumw', 'LHEPdfSumw']:
                        # Sum per variation, conserve number of variations
                        tmp = 0 * arr[0]
                        for i in range(len(arr)):
                            for j in range(len(arr[i])):
                                tmp[j] += arr[i][j]
                        df[name] = int(item.entrystart == 0) * tmp
                ### END NANOAOD
                df['dataset'] = item.dataset
                df['filename'] = item.filename
            tic = time.time()
            out = processor_instance.process(df)
            toc = time.time()
            metrics = dict_accumulator()
            if savemetrics:
                if isinstance(file.source, uproot.source.xrootd.XRootDSource):
                    metrics['bytesread'] = value_accumulator(
                        int, file.source.bytesread)
                    metrics['dataservers'] = set_accumulator(
                        {file.source._source.get_property('DataServer')})
                metrics['columns'] = set_accumulator(df.materialized)
                metrics['entries'] = value_accumulator(int, df.size)
                metrics['processtime'] = value_accumulator(float, toc - tic)
            wrapped_out = dict_accumulator({'out': out, 'metrics': metrics})
            file.source.close()
            break
        # catch xrootd errors and optionally skip
        # or retry to read the file
        except OSError as e:
            if not skipbadfiles:
                raise e
            else:
                w_str = 'Bad file source %s.' % item.filename
                if retries:
                    w_str += ' Attempt %d of %d.' % (retry_count + 1,
                                                     retries + 1)
                    if retry_count + 1 < retries:
                        w_str += ' Will retry.'
                    else:
                        w_str += ' Skipping.'
                else:
                    w_str += ' Skipping.'
                warnings.warn(w_str)
            metrics = dict_accumulator()
            if savemetrics:
                metrics['bytesread'] = value_accumulator(int, 0)
                metrics['dataservers'] = set_accumulator({})
                metrics['columns'] = set_accumulator({})
                metrics['entries'] = value_accumulator(int, 0)
                metrics['processtime'] = value_accumulator(float, 0)
            wrapped_out = dict_accumulator({'out': out, 'metrics': metrics})
        except Exception as e:
            if retries == retry_count:
                raise e
            w_str = 'Attempt %d of %d. Will retry.' % (retry_count + 1,
                                                       retries + 1)
            warnings.warn(w_str)
        retry_count += 1

    return wrapped_out
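Each chunk returns dict_accumulator({'out': ..., 'metrics': ...}), and the executor folds these together, so value_accumulator entries such as 'processtime' sum across chunks while set_accumulator entries such as 'columns' take the union. A toy merge with coffea's accumulators:

from coffea import processor

m1 = processor.dict_accumulator({
    'processtime': processor.value_accumulator(float, 1.5),
    'columns': processor.set_accumulator({'Jet_pt'}),
})
m2 = processor.dict_accumulator({
    'processtime': processor.value_accumulator(float, 2.0),
    'columns': processor.set_accumulator({'Muon_pt'}),
})
m1 += m2
assert m1['processtime'].value == 3.5
assert m1['columns'] == {'Jet_pt', 'Muon_pt'}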
Example #9
def run_uproot_job_nanoaod(fileset,
                           treename,
                           processor_instance,
                           executor,
                           executor_args={},
                           pre_executor=None,
                           pre_args=None,
                           chunksize=200000,
                           maxchunks=None,
                           metadata_cache=LRUCache(100000)):
    '''A tool to run a processor using uproot for data delivery

    A convenience wrapper to submit jobs for a file set, which is a
    dictionary of dataset: [file list] entries.  Supports only uproot
    reading, via the LazyDataFrame class.  For more customized processing,
    e.g. to read other objects from the files and pass them into data frames,
    one can write a similar function in their user code.

    Parameters
    ----------
        fileset : dict
            A dictionary ``{dataset: [file, file], }``
            Optionally, if some files' tree names differ, the dictionary can be specified:
            ``{dataset: {'treename': 'name', 'files': [file, file]}, }``
        treename : str
            name of tree inside each root file, can be ``None``;
            treename can also be defined in fileset, which will override the passed treename
        processor_instance : ProcessorABC
            An instance of a class deriving from ProcessorABC
        executor : callable
            A function that takes 3 arguments: items, function, accumulator
            and performs some action equivalent to:
            ``for item in items: accumulator += function(item)``
        executor_args : dict, optional
            Arguments to pass to executor.  See `iterative_executor`,
            `futures_executor`, `dask_executor`, or `parsl_executor` for available options.
            Some options that affect the behavior of this function:
            'savemetrics' saves some detailed metrics for xrootd processing (default False);
            'flatten' removes any jagged structure from the input files (default False);
            'processor_compression' sets the compression level used to send processor instance
            to workers (default 1).
        pre_executor : callable
            A function like executor, used to calculate fileset metadata.
            Defaults to executor.
        pre_args : dict, optional
            Similar to executor_args, defaults to executor_args
        chunksize : int, optional
            Maximum number of entries to process at a time in the data frame.
        maxchunks : int, optional
            Maximum number of chunks to process per dataset
            Defaults to processing the whole dataset
        metadata_cache : mapping, optional
            A dict-like object to use as a cache for (file, tree) metadata that is used to
            determine chunking.  Defaults to an in-memory LRU cache that holds 100k entries
            (about 1MB depending on the length of filenames, etc.)  If you edit an input file
            (please don't) during a session, the session can be restarted to clear the cache.
    '''
    if not isinstance(fileset, (Mapping, str)):
        raise ValueError(
            "Expected fileset to be a mapping dataset: list(files) or filename"
        )
    if not isinstance(processor_instance, ProcessorABC):
        raise ValueError(
            "Expected processor_instance to derive from ProcessorABC")

    if pre_executor is None:
        pre_executor = executor
    if pre_args is None:
        pre_args = dict(executor_args)
    # work on a copy, since several options are popped out of executor_args below
    executor_args = dict(executor_args)
    if metadata_cache is None:
        metadata_cache = DEFAULT_METADATA_CACHE

    fileset = list(_normalize_fileset(fileset, treename))
    for filemeta in fileset:
        filemeta.maybe_populate(metadata_cache)

    # pop _get_metadata args here (also sent to _work_function)
    skipbadfiles = executor_args.pop('skipbadfiles', False)
    retries = executor_args.pop('retries', 0)
    xrootdtimeout = executor_args.pop('xrootdtimeout', None)
    align_clusters = executor_args.pop('align_clusters', False)
    metadata_fetcher = partial(
        _get_metadata,
        skipbadfiles=skipbadfiles,
        retries=retries,
        xrootdtimeout=xrootdtimeout,
        align_clusters=align_clusters,
    )

    chunks = []
    if maxchunks is None:
        # this is a bit of an abuse of map-reduce but ok
        to_get = set(filemeta for filemeta in fileset
                     if not filemeta.populated(clusters=align_clusters))
        if len(to_get) > 0:
            out = set_accumulator()
            pre_arg_override = {
                'desc': 'Preprocessing',
                'unit': 'file',
                'compression': None,
                'tailtimeout': None,
                'worker_affinity': False,
            }
            pre_args.update(pre_arg_override)
            pre_executor(to_get, metadata_fetcher, out, **pre_args)
            while out:
                item = out.pop()
                metadata_cache[item] = item.metadata
            for filemeta in fileset:
                filemeta.maybe_populate(metadata_cache)
        while fileset:
            filemeta = fileset.pop()
            if skipbadfiles and not filemeta.populated(
                    clusters=align_clusters):
                continue
            for chunk in filemeta.chunks(chunksize, align_clusters):
                chunks.append(chunk)
    else:
        # get just enough file info to compute chunking
        nchunks = defaultdict(int)
        while fileset:
            filemeta = fileset.pop()
            if nchunks[filemeta.dataset] >= maxchunks:
                continue
            if not filemeta.populated(clusters=align_clusters):
                filemeta.metadata = metadata_fetcher(filemeta).pop().metadata
                metadata_cache[filemeta] = filemeta.metadata
            if skipbadfiles and not filemeta.populated(
                    clusters=align_clusters):
                continue
            for chunk in filemeta.chunks(chunksize, align_clusters):
                chunks.append(chunk)
                nchunks[filemeta.dataset] += 1
                if nchunks[filemeta.dataset] >= maxchunks:
                    break

    # pop all _work_function args here
    savemetrics = executor_args.pop('savemetrics', False)
    flatten = executor_args.pop('flatten', False)
    mmap = executor_args.pop('mmap', False)
    nano = executor_args.pop('nano', False)
    cachestrategy = executor_args.pop('cachestrategy', None)
    pi_compression = executor_args.pop('processor_compression', 1)
    if pi_compression is None:
        pi_to_send = processor_instance
    else:
        pi_to_send = lz4f.compress(cloudpickle.dumps(processor_instance),
                                   compression_level=pi_compression)
    closure = partial(
        _work_function_nanoaod,
        flatten=flatten,
        savemetrics=savemetrics,
        mmap=mmap,
        nano=nano,
        cachestrategy=cachestrategy,
        skipbadfiles=skipbadfiles,
        retries=retries,
        xrootdtimeout=xrootdtimeout,
    )
    # hack around dask/dask#5503 which is really a silly request but here we are
    if executor is dask_executor:
        executor_args['heavy_input'] = pi_to_send
        closure = partial(closure, processor_instance='heavy')
    else:
        closure = partial(closure, processor_instance=pi_to_send)

    out = processor_instance.accumulator.identity()
    wrapped_out = dict_accumulator({'out': out, 'metrics': dict_accumulator()})
    exe_args = {
        'unit': 'chunk',
        'function_name': type(processor_instance).__name__,
    }
    exe_args.update(executor_args)
    executor(chunks, closure, wrapped_out, **exe_args)
    wrapped_out['metrics']['chunks'] = value_accumulator(int, len(chunks))
    processor_instance.postprocess(out)
    if savemetrics:
        return out, wrapped_out['metrics']
    return out
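A hypothetical call sketch (the dataset name, file paths, and MyProcessor are placeholders; any ProcessorABC subclass and coffea executor would do):

from coffea import processor

fileset = {"ttbar": ["ttbar_nano_1.root", "ttbar_nano_2.root"]}
out, metrics = run_uproot_job_nanoaod(
    fileset,
    treename="Events",
    processor_instance=MyProcessor(),  # derives from ProcessorABC
    executor=processor.iterative_executor,
    executor_args={"savemetrics": True},  # savemetrics=True makes it return (out, metrics)
    chunksize=100000,
)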
Example #10
    def accumulator(self):
        return dict_accumulator({})
Example #11
    def __init__(self, task):
        super().__init__(task)

        self._accumulator["arrays"] = dict_accumulator()
Example #12
def run_uproot_job_nanoaod(fileset, treename, processor_instance, executor, executor_args={}, chunksize=500000, maxchunks=None):
    '''
    A convenience wrapper to submit jobs for a file set, which is a
    dictionary of dataset: [file list] entries.  Supports only uproot
    reading, via the LazyDataFrame class.  For more customized processing,
    e.g. to read other objects from the files and pass them into data frames,
    one can write a similar function in their user code.

    Parameters
    ----------
        fileset:
            dictionary {dataset: [file, file], }
        treename:
            name of tree inside each root file
        processor_instance:
            an instance of a class deriving from ProcessorABC
        executor:
            any of `iterative_executor`, `futures_executor`, etc.

            In general, a function that takes 3 arguments: items, function, accumulator
            and performs some action equivalent to:
            for item in items: accumulator += function(item)
        executor_args:
            extra arguments to pass to executor
            currently supported:
                workers: number of parallel processes for futures
                pre_workers: number of parallel threads for calculating chunking
                savemetrics: save some detailed metrics for xrootd processing
                flatten: flatten all branches returned by the dataframe (no jagged structure)
        chunksize:
            number of entries to process at a time in the data frame
        maxchunks:
            maximum number of chunks to process per dataset
    '''
    if not isinstance(fileset, Mapping):
        raise ValueError("Expected fileset to be a mapping dataset: list(files)")
    if not isinstance(processor_instance, ProcessorABC):
        raise ValueError("Expected processor_instance to derive from ProcessorABC")

    # work on a copy, since defaults are inserted into executor_args below
    executor_args = dict(executor_args)
    executor_args.setdefault('workers', 1)
    executor_args.setdefault('pre_workers', 4 * executor_args['workers'])
    executor_args.setdefault('savemetrics', False)

    items = []
    for dataset, filelist in tqdm(fileset.items(), desc='Preprocessing'):
        if maxchunks is not None:
            chunks = _get_chunking_lazy(tuple(filelist), treename, chunksize)
        else:
            chunks = _get_chunking(tuple(filelist), treename, chunksize, executor_args['pre_workers'])
        for ichunk, chunk in enumerate(chunks):
            # >= so that at most maxchunks chunks are kept per dataset
            if (maxchunks is not None) and (ichunk >= maxchunks):
                break
            items.append((dataset, chunk[0], treename, chunk[1], chunk[2], processor_instance))

    out = processor_instance.accumulator.identity()
    wrapped_out = dict_accumulator({'out': out, 'metrics': dict_accumulator()})
    executor(items, _work_function_nanoaod, wrapped_out, **executor_args)
    processor_instance.postprocess(out)
    if executor_args['savemetrics']:
        return out, wrapped_out['metrics']
    return out
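The executor contract described in the docstring, written out as a minimal sketch (any callable of this shape, plus whatever keyword options it consumes, can be passed as executor):

def sequential_executor(items, function, accumulator, **kwargs):
    # performs exactly the action the docstring specifies
    for item in items:
        accumulator += function(item)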