def _parallel_batcher(function, args_kwargs):
    """Runs a batch of calls to function, pre-computing a shared set of load
    hints so that data loading can be cached across calls.

    Args:
        function: The callable to invoke once per args/kwargs pair; it must
            accept a load_hints keyword argument
        args_kwargs: An iterable of (args, kwargs) pairs to call function
            with
    """
    # Build the union of all properties needed by any call in the batch
    # NOTE: We don't need to handle patch properties because those are
    # handled internally by the process and we're only dealing with one
    # process in batch mode
    hints = set()
    for call_args, call_kwargs in args_kwargs:
        # Pull out the region and expressions for this call
        _, region, expressions, _ = _parallel_extractor(*call_args,
                                                        **call_kwargs)

        # Fold in the properties needed by the region's selection/weight
        selection, weight = region.selection_weight()
        hints.update(properties(selection))
        hints.update(properties(weight))

        # Fold in the properties needed by each expression
        for expression in expressions:
            hints.update(properties(expression))

    # Now make every call, passing along the combined load hints
    for call_args, call_kwargs in args_kwargs:
        call_kwargs['load_hints'] = hints
        function(*call_args, **call_kwargs)

    # Force garbage collection
    gc.collect()

    # Clear the load caches of the caching loader
    _caching_loader.caches.clear()
def _count(process, region):
    """Computes the weighted event count of a process in a region.

    Args:
        process: The process whose events should be counted
        region: The region whose weighting/selection should be applied

    Returns:
        The weighted event count in the region.
    """
    # Grab the region's selection and weight expressions
    selection, weight = region.selection_weight()

    # Determine which data properties are needed to evaluate them
    needed = set().union(properties(selection), properties(weight))

    # Load data
    data = process.load(needed)

    # Filter by the selection, if one is specified
    if selection != '':
        data = data[data.eval(normalized(selection))]

    # Sum the weights if a weight is specified, otherwise just count rows
    if weight != '':
        return data.eval(normalized(weight)).sum()
    return len(data)
def properties(self):
    """Returns a Python set of properties of the data required to evaluate
    the patch.

    Returns:
        A Python set containing strings of the required patch properties.
    """
    # NOTE: Although this method is itself named `properties`, the call
    # below resolves to the module-level properties() function, not this
    # method — class attributes are not part of a method body's lexical
    # scope.  Presumably _selection is an expression string; the required
    # properties are exactly those of that expression.
    return properties(self._selection)
def _histogram(process, region, expressions, binnings, load_hints = None):
    """Generates a ROOT histogram of a distribution of a process in a
    region.

    Args:
        process: The process whose events should be histogrammed
        region: The region whose weighting/selection should be applied
        expressions: A tuple of expression strings
        binnings: A tuple of Binning instances
        load_hints: If provided, this argument will hint to _histogram that
            it should load additional properties when loading data and that
            it should use the _caching_loader.  This facilitates cached
            loading of data across multiple calls to _histogram with the
            same process.  This is particularly useful for parallelized
            histogramming, where the jobs are grouped by process.

    Returns:
        A ROOT histogram, of the TH1F, TH2F, or TH3F variety.
    """
    # Compute weighted selection
    selection, weight = region.selection_weight()

    # Expand binnings to edge lists
    edges = tuple((b.edges() for b in binnings))

    # Load data
    if load_hints is not None:
        # If load_hints have been provided, just use those with the
        # _caching_loader
        data = _caching_loader(process, load_hints)
    else:
        # Otherwise manually create the set of necessary properties
        # NOTE: All we need are region and expression properties - patch
        # properties are handled internally by the process
        required_properties = set()

        # Add those properties necessary to evaluate region selection/weight
        required_properties.update(properties(selection))
        required_properties.update(properties(weight))

        # Add in those properties necessary to evaluate expressions
        required_properties.update(*(properties(e) for e in expressions))

        # Load data
        data = process.load(required_properties)

    # Apply selection if specified
    if selection != '':
        data = data[data.eval(normalized(selection))]

    # Evaluate each variable expression, converting the resultant Pandas
    # Series to a NumPy array
    # HACK: TH1::FillN only supports 64-bit floating point values, so
    # convert things.  Would be nice to find a better approach.
    samples = tuple((data.eval(normalized(e)).values.astype(numpy.float64)
                     for e in expressions))

    # Evaluate weights, converting the resultant Pandas Series to a NumPy
    # array
    # HACK: TH1::FillN only supports 64-bit floating point values, so
    # convert things.  Would be nice to find a better approach.
    if weight != '':
        weights = data.eval(normalized(weight)).values.astype(numpy.float64)
    else:
        weights = nullptr

    # Create a unique name and title for the histogram
    name = title = uuid4().hex

    # Create a histogram based on dimensionality
    # NOTE: When specifying explicit bin edges, you aren't passing a length
    # argument, you are passing an nbins argument, which is length - 1,
    # hence the code below.  If you pass length for n bins, then you'll get
    # garbage for the last bin's upper edge and things go nuts in ROOT.
    dimensionality = len(expressions)
    count = len(data)
    if dimensionality == 1:
        # Create a one-dimensional histogram
        result = TH1F(name, title, len(edges[0]) - 1, edges[0])

        # Fill the histogram
        # HACK: TH1::FillN will die if N == 0
        if count > 0:
            result.FillN(count, samples[0], weights)
    elif dimensionality == 2:
        # Create a two-dimensional histogram
        result = TH2F(name, title,
                      len(edges[0]) - 1, edges[0],
                      len(edges[1]) - 1, edges[1])

        # Fill the histogram
        # HACK: TH1::FillN will die if N == 0
        if count > 0:
            result.FillN(count, samples[0], samples[1], weights)
    elif dimensionality == 3:
        # Create a three-dimensional histogram
        result = TH3F(name, title,
                      len(edges[0]) - 1, edges[0],
                      len(edges[1]) - 1, edges[1],
                      len(edges[2]) - 1, edges[2])

        # HACK: TH3 doesn't have a FillN method, so we have to do things the
        # slow way.
        # TODO: We may want to put a warning about this slowness
        # BUGFIX: This must be an identity check - when a weight expression
        # was given, weights is a NumPy array and == would broadcast
        # elementwise, producing an array whose truth value is ambiguous
        if weights is nullptr:
            weights = numpy.ones(count, dtype = numpy.float64)
        for x, y, z, w in zip(samples[0], samples[1], samples[2], weights):
            result.Fill(x, y, z, w)
    else:
        raise ValueError('ROOT can only histogram 1 - 3 dimensions')

    # All done
    return result
def _histogram(process, region, expressions, binnings, load_hints=None):
    """Generates a ROOT histogram of a distribution of a process in a
    region.

    Args:
        process: The process whose events should be histogrammed
        region: The region whose weighting/selection should be applied
        expressions: A tuple of expression strings
        binnings: A tuple of Binning instances
        load_hints: If provided, this argument will hint to _histogram that
            it should load additional properties when loading data and that
            it should use the _caching_loader.  This facilitates cached
            loading of data across multiple calls to _histogram with the
            same process.  This is particularly useful for parallelized
            histogramming, where the jobs are grouped by process.

    Returns:
        A ROOT histogram, of the TH1F, TH2F, or TH3F variety.
    """
    # Compute weighted selection
    selection, weight = region.selection_weight()

    # Expand binnings to edge lists
    edges = tuple((b.edges() for b in binnings))

    # Load data
    if load_hints is not None:
        # If load_hints have been provided, just use those with the
        # _caching_loader
        data = _caching_loader(process, load_hints)
    else:
        # Otherwise manually create the set of necessary properties
        # NOTE: All we need are region and expression properties - patch
        # properties are handled internally by the process
        required_properties = set()

        # Add those properties necessary to evaluate region selection/weight
        required_properties.update(properties(selection))
        required_properties.update(properties(weight))

        # Add in those properties necessary to evaluate expressions
        required_properties.update(*(properties(e) for e in expressions))

        # Load data
        data = process.load(required_properties)

    # Apply selection if specified
    if selection != '':
        data = data[data.eval(normalized(selection))]

    # Evaluate each variable expression, converting the resultant Pandas
    # Series to a NumPy array
    # HACK: TH1::FillN only supports 64-bit floating point values, so
    # convert things.  Would be nice to find a better approach.
    samples = tuple((data.eval(normalized(e)).values.astype(numpy.float64)
                     for e in expressions))

    # Evaluate weights, converting the resultant Pandas Series to a NumPy
    # array
    # HACK: TH1::FillN only supports 64-bit floating point values, so
    # convert things.  Would be nice to find a better approach.
    if weight != '':
        weights = data.eval(normalized(weight)).values.astype(numpy.float64)
    else:
        weights = nullptr

    # Create a unique name and title for the histogram
    name = title = uuid4().hex

    # Create a histogram based on dimensionality
    # NOTE: When specifying explicit bin edges, you aren't passing a length
    # argument, you are passing an nbins argument, which is length - 1,
    # hence the code below.  If you pass length for n bins, then you'll get
    # garbage for the last bin's upper edge and things go nuts in ROOT.
    dimensionality = len(expressions)
    count = len(data)
    if dimensionality == 1:
        # Create a one-dimensional histogram
        result = TH1F(name, title, len(edges[0]) - 1, edges[0])

        # Fill the histogram
        # HACK: TH1::FillN will die if N == 0
        if count > 0:
            result.FillN(count, samples[0], weights)
    elif dimensionality == 2:
        # Create a two-dimensional histogram
        result = TH2F(name, title,
                      len(edges[0]) - 1, edges[0],
                      len(edges[1]) - 1, edges[1])

        # Fill the histogram
        # HACK: TH1::FillN will die if N == 0
        if count > 0:
            result.FillN(count, samples[0], samples[1], weights)
    elif dimensionality == 3:
        # Create a three-dimensional histogram
        result = TH3F(name, title,
                      len(edges[0]) - 1, edges[0],
                      len(edges[1]) - 1, edges[1],
                      len(edges[2]) - 1, edges[2])

        # HACK: TH3 doesn't have a FillN method, so we have to do things the
        # slow way.
        # TODO: We may want to put a warning about this slowness
        # BUGFIX: This must be an identity check - when a weight expression
        # was given, weights is a NumPy array and == would broadcast
        # elementwise, producing an array whose truth value is ambiguous
        if weights is nullptr:
            weights = numpy.ones(count, dtype=numpy.float64)
        for x, y, z, w in zip(samples[0], samples[1], samples[2], weights):
            result.Fill(x, y, z, w)
    else:
        raise ValueError('ROOT can only histogram 1 - 3 dimensions')

    # All done
    return result
def test_properties(self):
    """Checks that properties() extracts the distinct property names from an
    expression, de-duplicating repeated names."""
    # Check a simple expression with duplicates
    # NOTE: A set literal is the idiomatic (and faster) spelling of
    # set([...])
    self.assertEqual(properties("electron_pt > (x * x)"),
                     {"electron_pt", "x"})
def test_properties(self):
    """Checks that properties() extracts the distinct property names from an
    expression, de-duplicating repeated names."""
    # Check a simple expression with duplicates
    # NOTE: A set literal is the idiomatic (and faster) spelling of
    # set([...])
    self.assertEqual(properties('electron_pt > (x * x)'),
                     {'electron_pt', 'x'})