def get_weights(self, model, model_kwargs=None):
    """
    Calculate weights for the variables in this category

    Args:
        model (callable): A model to be evaluated

    Keyword Args:
        model_kwargs (dict): Will be passed to model
    """
    # weighting needs the MC primary variables read out first
    if not self.mc_p_readout:
        self.read_mc_primary()

    if model_kwargs is None:
        model_kwargs = dict()

    # mandatory inputs for the weight function
    func_kwargs = {MC_P_EN: self.get(MC_P_EN),
                   MC_P_TY: self.get(MC_P_TY),
                   MC_P_WE: self.get(MC_P_WE)}

    # optional inputs - omit whatever is not available
    for key in (MC_P_ZE, MC_P_GW, MC_P_TS, DATASETS):
        # the datasets are passed to the weight function under a different keyword
        reg = 'mc_datasets' if key == DATASETS else key
        try:
            func_kwargs[reg] = self.get(key)
        except KeyError:
            # typo fix: was "informatiion"
            Logger.warning("No MCPrimary {0} information! Trying to omit..".format(key))

    func_kwargs.update(model_kwargs)
    Logger.info("Getting weights for datasets {}".format(repr(self.datasets)))
    self._weights = pd.Series(self._weightfunction(model, self.datasets,
                                                   **func_kwargs))
def set_run_start_stop(self, runstart_var=None, runstop_var=None):
    """
    Let the category know which variables describe the
    start and stop times of a run

    Keyword Args:
        runstart_var (pyevself.variables.variables.Variable): beginning of a run
        runstop_var (pyevself.variables.variables.Variable): end of a run
    """
    #FIXME
    # NOTE: defaults were `variables.Variable(None)` instances created at
    # definition time - a shared mutable default; recreate them per call
    if runstart_var is None:
        runstart_var = variables.Variable(None)
    if runstop_var is None:
        runstop_var = variables.Variable(None)

    for var, name in [(runstart_var, RUN_START), (runstop_var, RUN_STOP)]:
        if var.name is None:
            Logger.warning("No {0} available".format(name))
        elif name in self.vardict:
            Logger.info("..{0} already defined, skipping...".format(name))
            continue
        else:
            # copy *before* renaming so the caller's object is not mutated
            newvar = deepcopy(var)
            if var.name != name:
                Logger.info("..renaming {0} to {1}..".format(var.name, name))
                newvar.name = name
            self.vardict[name] = newvar
    self._runstartstop_set = True
def read_mc_primary(self, energy_var=MC_P_EN,
                    type_var=MC_P_TY,
                    zenith_var=MC_P_ZE,
                    weight_var=MC_P_WE):
    """
    Trigger the readout of MC Primary information
    Rename variables to magic keywords if necessary

    Keyword Args:
        energy_var (str): simulated primary energy
        type_var (str): simulated primary type
        zenith_var (str): simulated primary zenith
        weight_var (str): a weight, e.g. interaction propability
    """
    requested = [energy_var, type_var, zenith_var, weight_var]
    self.read_variables(requested)

    # map any custom variable names onto the canonical magic keywords
    canonical = [MC_P_EN, MC_P_TY, MC_P_ZE, MC_P_WE]
    for varname, defaultname in zip(requested, canonical):
        if varname != defaultname:
            Logger.warning("..renaming {} to {}..".format(varname, defaultname))
            self.vardict[varname].name = defaultname
    self._mc_p_readout = True
def estimate_livetime(self, force=False):
    """
    Calculate the livetime from run start/stop times, account for gaps

    Keyword Args:
        force (bool): override existing livetime
    """
    if self.livetime and (self.livetime != "guess"):
        Logger.warning("There is already a livetime of {:4.2f} ".format(self.livetime))
        if force:
            Logger.warning("Applying force...")
        else:
            Logger.warning("If you really want to do this, use force = True")
            return

    if not self._runstartstop_set:
        if (RUN_STOP in self.vardict.keys()) and (RUN_START in self.vardict.keys()):
            self._runstartstop_set = True
        else:
            Logger.warning("Need to set run start and stop times first! use object.set_run_start_stop")
            return

    Logger.warning("This is a crude estimate! Rather use a good run list or something!")
    # per-run duration
    lengths = self.get(RUN_STOP) - self.get(RUN_START)
    # gap between consecutive runs
    gaps = self.get(RUN_START)[1:] - self.get(RUN_STOP)[:-1]
    # detector livetime is the duration of all events + the length of all
    # gaps between events that are short enough to be not downtime (guess: 30s)
    est_ltime = lengths.sum() + gaps[(0 < gaps) & (gaps < 30)].sum()
    self.set_livetime(est_ltime)
    return
def get_files(self, *args, **kwargs):
    """
    Load files for this category
    uses pyevsel.utils.files.harvest_files

    Args:
        *args (list of strings): Path to possible files

    Keyword Args:
        datasets (dict(dataset_id : nfiles)): if given, load only files from
            dataset dataset_id; set nfiles parameter to amount of L2 files
            the loaded files will represent
        force (bool): forcibly reload filelist (pre-readout vars will be lost)

    all other kwargs will be passed to
    utils.files.harvest_files
    """
    force = kwargs.pop("force", False)
    if self.is_harvested:
        Logger.info("Variables have already been harvested! "
                    "if you really want to reload the filelist, "
                    "use 'force=True'. "
                    "If you do so, all your harvested variables will be deleted!")
        if not force:
            return
        Logger.warning("..using force..")

    if "datasets" in kwargs:
        self.datasets = kwargs.pop("datasets")
        files = harvest_files(*args, **kwargs)
        datasets = [self._ds_regexp(x) for x in files]
        assert len(datasets) == len(files)
        # BUGFIX: zip() is a single-pass iterator in Python 3; it must be
        # materialized, otherwise only the first dataset key gets any files
        ds_files = list(zip(datasets, files))
        filtered_files = []
        for k in self.datasets.keys():
            filtered_files.extend([fname for ds, fname in ds_files if ds == k])
        files = filtered_files
    else:
        files = harvest_files(*args, **kwargs)

    self.files = files
def GetCategoryConfig(name):
    """
    Get the relevant config section from the actual
    config for a category

    Args:
        name (string): Name of a category to search for

    Returns:
        dict: the matching category config, or None if no match was found
    """
    # use a context manager so the file handle is closed, and safe_load
    # so no arbitrary python objects can be constructed from the yaml
    with open(CONFIGFILE, "r") as cfgfile:
        configs = yaml.safe_load(cfgfile)
    for cfg in configs["categories"]:
        if cfg["name"] == name:
            # FIXME little hack for bad latex parsing
            # by yaml
            cfg["label"] = SLASHES.sub(r"\\", cfg["label"])
            return cfg
    # BUGFIX: previously fell through to `return cfg`, silently handing
    # back the *last* category when no name matched
    Logger.warning("No config for {0} found!".format(name))
    return None
def __getitem__(self,item):
    # Fall back to returning the lookup key itself when it is unknown,
    # so arbitrary color specs pass through unchanged.
    if item in self:
        return self.get(item)
    else:
        return item

# module-level flag: records whether seaborn could be imported
seaborn_loaded = False
try:
    import seaborn.apionly as sb
    seaborn_loaded = True
    Logger.debug("Seaborn found!")
except ImportError:
    Logger.warning("Seaborn not found! Using predefined color palette")

def get_color_palette(name="dark"):
    """
    Load a color pallete, use seaborn if available
    """
    if not seaborn_loaded:
        # hardcoded fallback palette used when seaborn is missing
        color_palette = ColorDict() # stolen from seaborn color-palette
        color_palette[0] = (0.2980392156862745, 0.4470588235294118, 0.6901960784313725)
        color_palette[5] = (0.8, 0.7254901960784313, 0.4549019607843137)#(0.3921568627450 9803, 0.7098039215686275, 0.803921568627451)
        color_palette["k"] = "k"
        color_palette[1] = (0.3333333333333333, 0.6588235294117647, 0.40784313725490196)
        color_palette[2] = (0.7686274509803922, 0.3058823529411765, 0.3215686274509804)
        color_palette[3] = (0.5058823529411764, 0.4470588235294118, 0.6980392156862745)
        color_palette['prohibited'] = 'grey'
        # NOTE(review): function appears truncated here - no return statement
        # and no seaborn branch is visible in this chunk; confirm against
        # the full file before relying on the return value.
def load_dataset(config, variables=None):
    """
    Loads a dataset according to a configuration file

    Args:
        config (str): json style config file

    Keyword Args:
        variables (list): if given, read out only these variables

    Returns:
        pyevsel dataset with weights applied
    """
    # FIXME: os.path exists tests
    with open(config) as cfgfile:
        cfg = commentjson.load(cfgfile)
    categories = dict()
    weightfunctions = dict()
    models = dict()
    files_basepath = cfg["files_basepath"]
    for cat in cfg["categories"].keys():
        thiscat = cfg["categories"][cat]
        if thiscat["datatype"] == "simulation":
            categories[cat] = c.Simulation(cat)
            # remember that json keys are strings, so
            # convert to int
            datasets = {int(x): int(thiscat["datasets"][x]) for x in thiscat["datasets"]}
            categories[cat].get_files(
                os.path.join(files_basepath, thiscat["subpath"]),
                prefix=thiscat["file_prefix"],
                datasets=datasets,
                ending=thiscat["file_type"],
            )
            try:
                fluxclass, flux = thiscat["model"].split(".")
                models[cat] = getattr(dict(inspect.getmembers(fluxes))[fluxclass], flux)
            except ValueError:
                Logger.warning(
                    "{} does not seem to be a valid model for {}. This might cause troubles. If not, it is probably fine!".format(
                        thiscat["model"], cat
                    )
                )
                models[cat] = None
            weightfunctions[cat] = dict(inspect.getmembers(wgt))[thiscat["model_method"]]
        elif thiscat["datatype"] == "data":
            categories[cat] = c.Data(cat)
            categories[cat].get_files(
                os.path.join(files_basepath, thiscat["subpath"]),
                prefix=thiscat["file_prefix"],
                ending=thiscat["file_type"],
            )
            models[cat] = float(thiscat["livetime"])
            weightfunctions[cat] = dict(inspect.getmembers(wgt))[thiscat["model_method"]]
        elif thiscat["datatype"] == "reweighted":
            # handled in a second pass below, once the parent exists
            pass
        else:
            # BUGFIX: this message was broken across a literal newline
            raise TypeError("Data type not understood. Has to be either 'simulation', 'reweighted' or 'data'!!")

    # at last we can take care of reweighted categories
    for cat in cfg["categories"].keys():
        thiscat = cfg["categories"][cat]
        if thiscat["datatype"] == "reweighted":
            categories[cat] = c.ReweightedSimulation(cat, categories[thiscat["parent"]])
            if thiscat["model"]:
                fluxclass, flux = thiscat["model"].split(".")
                models[cat] = getattr(dict(inspect.getmembers(fluxes))[fluxclass], flux)
                weightfunctions[cat] = dict(inspect.getmembers(wgt))[thiscat["model_method"]]
        elif thiscat["datatype"] in ["data", "simulation"]:
            pass
        else:
            raise TypeError("Data type not understood. Has to be either 'simulation', 'reweighted' or 'data'!!")

    # combined categories
    combined_categories = dict()
    # BUGFIX(review): the original iterated over the just-initialized
    # (empty) combined_categories dict, so no combined category was ever
    # built. Iterate the config section instead - presumably each entry
    # maps a name to a list of member category names; confirm config schema.
    for k in cfg.get("combined_categories", {}):
        combined_categories[k] = [categories[l] for l in cfg["combined_categories"][k]]

    # import variable defs
    vardefs = __import__(cfg["variable_definitions"])

    dataset = ds.Dataset(*categories.values(), combined_categories=combined_categories)
    dataset.read_variables(vardefs, names=variables)
    dataset.set_weightfunction(weightfunctions)
    dataset.get_weights(models=models)
    return dataset
def read_variables(self, names=None):
    """
    Harvest the variables in self.vardict

    Keyword Args:
        names (list): harvest only these variables
    """
    if names is None:
        names = self.vardict.keys()

    # first read out simple variables, then compound variables,
    # since the latter are built from the former
    compound_variables = []  # harvest them later
    simple_vars = []
    for varname in names:
        try:
            var = self.vardict[varname]
        except KeyError:
            Logger.warning("Cannot find {} in variables!".format(varname))
            continue
        if isinstance(var, (variables.CompoundVariable, variables.VariableList)):
            compound_variables.append(varname)
        else:
            simple_vars.append(varname)

    future_to_varname = {}
    # BUGFIX: use a context manager so the worker processes are reliably
    # shut down instead of leaking the executor
    with fut.ProcessPoolExecutor(max_workers=MAX_CORES) as executor:
        for varname in simple_vars:
            # FIXME: Make it an option to not use
            # multi cpu readout!
            future_to_varname[executor.submit(variables.harvest,
                                              self.files,
                                              self.vardict[varname].definitions)] = varname

        # optional progress bar
        progbar = False
        try:
            import pyprind
            bar = pyprind.ProgBar(len(future_to_varname),
                                  monitor=False, bar_char='#',
                                  title=self.name)
            progbar = True
        except ImportError:
            pass

        exc_caught = ""
        for future in fut.as_completed(future_to_varname):
            varname = future_to_varname[future]
            Logger.debug("Reading {} finished".format(varname))
            try:
                data = future.result()
                Logger.debug("Found {} entries ...".format(len(data)))
                data = self.vardict[varname].transform(data)
            except Exception as exc:
                # collect exceptions and report them all at the end
                exc_caught += "Reading {} for {} generated an exception: {}\n".format(varname, self.name, exc)
                data = pd.Series([])
            self.vardict[varname].data = data
            self.vardict[varname].declare_harvested()
            if progbar:
                bar.update()

    for varname in compound_variables:
        #FIXME check if this causes a memory leak
        self.vardict[varname].rewire_variables(self.vardict)
        self.vardict[varname].harvest()

    if exc_caught:
        Logger.warning("During the variable readout some exceptions occured!\n" + exc_caught)
    self._is_harvested = True
import os
import pandas as pd
import tables
import abc
from pyevsel.utils import files as f
from pyevsel.utils.logger import Logger

# default number of bins used for histogramming (presumably; only the
# name and value are visible here)
DEFAULT_BINS = 70

# file extensions this module can read
REGISTERED_FILEEXTENSIONS = [".h5",".root"]

# root support is optional: if root_numpy is missing, drop ".root"
# from the registered extensions
try:
    import root_numpy as rn
except ImportError:
    Logger.warning("No root_numpy found, root support is limited!")
    REGISTERED_FILEEXTENSIONS.remove(".root")

################################################################

# define a non-member function so that it can be used in a
# multiprocessing approach
#def harvest_single_file(filename, filetype, definitions):
#    """
#    Get the variable data from a fileobject
#    Optimized for hdf files
#
#    Args:
#        filename (str):
#        filetype (str): the extension of the filename, eg "h5"
#