def to_yaml_file(self, name):
    """Serialize this fit result to a YAML file on disk.

    The output location is resolved through ``get_fit_result_path``.

    Arguments:
        name (str): Name of the fit result.

    Return:
        str: Output file name.

    Raise:
        NotInitializedError: If the fit result has not been initialized.
    """
    with _paths.work_on_file(
            name, path_func=_paths.get_fit_result_path) as output_file:
        write_config(self.to_yaml(), output_file)
        return output_file
def write_to_disk(self, name, link_from=None):
    """Write efficiency object to disk.

    Arguments:
        name (str): Name of the efficiency object.
        link_from (str, optional): Storage to link from. Defaults to no link.

    Return:
        str: Path of the output file.

    Raise:
        NotImplementedError: If the efficiency has no concrete MODEL_NAME.
    """
    # Generic efficiencies carry no model identifier and cannot be persisted.
    if not self.MODEL_NAME:
        raise NotImplementedError("Cannot save generic Efficiency")
    payload = {'model': self.MODEL_NAME,
               'variables': self.get_variables(),
               'parameters': self._config}
    with work_on_file(name, get_efficiency_path, link_from) as out_path:
        write_config(payload, out_path)
        return out_path
def run(config_files, link_from, verbose):
    """Run the script.

    Run a generate/fit sequence as many times as requested.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.
        verbose (bool): Give verbose output?

    Raise:
        OSError: If the configuration file or some other input does not exist.
        AttributeError: If the input data are incompatible with a previous fit.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the fitting.
    """
    try:
        config = _config.load_config(
            *config_files,
            validate=['syst/ntoys', 'name', 'randomizer'])
    except OSError:
        raise OSError(
            "Cannot load configuration files: {}".format(config_files))
    except _config.ConfigError as error:
        # One targeted message per missing mandatory key before failing
        if 'syst/ntoys' in error.missing_keys:
            logger.error("Number of toys not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'randomizer' in error.missing_keys:
            logger.error(
                "No randomizer configuration specified in config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise
    model_name = config['syst'].get('model', 'model')  # TODO: 'model' returns name?
    try:
        model_config = config[model_name]
    except KeyError as error:
        logger.error("Missing model configuration -> %s", str(error))
        raise KeyError("Missing model configuration")
    # Load fit model (two independent copies: one stays nominal, one is randomized)
    try:
        fit_model = configure_model(copy.deepcopy(model_config))
        randomizer_model = configure_model(copy.deepcopy(model_config))
    except KeyError:
        logger.exception('Error loading model')
        raise ValueError('Error loading model')
    # Some info
    ntoys = config['syst'].get('ntoys-per-job', config['syst']['ntoys'])
    logger.info("Doing %s generate/fit sequences", ntoys)
    logger.info("Systematics job name: %s", config['name'])
    if link_from:
        # Command-line argument overrides the config file
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Now load the acceptance
    try:
        acceptance = get_acceptance(config['acceptance']) \
            if 'acceptance' in config \
            else None
    except _config.ConfigError as error:
        raise KeyError("Error loading acceptance -> {}".format(error))
    # Fit strategy
    fit_strategy = config['syst'].get('strategy', 'simple')
    # Load randomizer configuration
    randomizer = get_randomizer(config['randomizer'])(
        model=randomizer_model,
        config=config['randomizer'],
        acceptance=acceptance)
    # Set seed
    job_id = get_job_id()
    # Start looping
    fit_results = {}
    logger.info("Starting sampling-fit loop (print frequency is 20)")
    initial_mem = memory_usage()
    initial_time = default_timer()
    do_extended = config['syst'].get('extended', False)
    do_minos = config['syst'].get('minos', False)
    for fit_num in range(ntoys):
        # Logging
        if (fit_num + 1) % 20 == 0:
            logger.info("  Fitting event %s/%s", fit_num + 1, ntoys)
        # Generate a dataset; seed both numpy and ROOT so the toy is reproducible
        seed = get_urandom_int(4)
        np.random.seed(seed=seed)
        ROOT.RooRandom.randomGenerator().SetSeed(seed)
        try:
            # Get a randomized dataset and fit it with the nominal fit
            dataset = randomizer.get_dataset(randomize=True)
            gen_values = randomizer.get_current_values()
            fit_result_nominal = fit(fit_model, model_name, fit_strategy,
                                     dataset, verbose,
                                     Extended=do_extended, Minos=do_minos)
            # Fit the randomized dataset with the randomized values as nominal
            fit_result_rand = fit(randomizer_model, model_name, fit_strategy,
                                  dataset, verbose,
                                  Extended=do_extended, Minos=do_minos)
            randomizer.reset_values()  # Needed to avoid generating unphysical values
        except ValueError as error:
            # Keep the RuntimeError contract but preserve the failure reason
            raise RuntimeError(
                "Fit failed on toy {} -> {}".format(fit_num, error))
        except Exception as error:
            # Log the full traceback instead of silently converting it
            logger.exception("Unexpected error in generate/fit sequence")
            raise RuntimeError(str(error))
        result = {}
        result['fitnum'] = fit_num
        result['seed'] = seed
        # Save the results of the randomized fit
        result_roofit_rand = FitResult.from_roofit(fit_result_rand)
        # Materialize the key view so the names survive object destruction
        # and can serve as a pandas index later on
        result['param_names'] = list(
            result_roofit_rand.get_fit_parameters().keys())
        result['rand'] = result_roofit_rand.to_plain_dict()
        result['rand_cov'] = result_roofit_rand.get_covariance_matrix()
        _root.destruct_object(fit_result_rand)
        # Save the results of the nominal fit
        result_roofit_nominal = FitResult.from_roofit(fit_result_nominal)
        result['nominal'] = result_roofit_nominal.to_plain_dict()
        result['nominal_cov'] = result_roofit_nominal.get_covariance_matrix()
        result['gen'] = gen_values
        _root.destruct_object(result_roofit_nominal)
        _root.destruct_object(dataset)
        fit_results[fit_num] = result
        logger.debug("Cleaning up")
    logger.info("Fitting loop over")
    logger.info("--> Memory leakage: %.2f MB/sample-fit",
                (memory_usage() - initial_mem) / ntoys)
    logger.info("--> Spent %.0f ms/sample-fit",
                (default_timer() - initial_time) * 1000.0 / ntoys)
    logger.info("Saving to disk")
    data_res = []
    cov_matrices = {}
    # Get covariance matrices
    for fit_num, fit_res_i in fit_results.items():
        fit_res = {'fitnum': fit_res_i['fitnum'],
                   'seed': fit_res_i['seed'],
                   'model_name': model_name,
                   'fit_strategy': fit_strategy}
        param_names = fit_res_i['param_names']
        cov_folder_rand = os.path.join(str(job_id),
                                       str(fit_res['fitnum']),
                                       'rand')
        cov_matrices[cov_folder_rand] = pd.DataFrame(fit_res_i['rand_cov'],
                                                     index=param_names,
                                                     columns=param_names)
        cov_folder_nominal = os.path.join(str(job_id),
                                          str(fit_res['fitnum']),
                                          'nominal')
        cov_matrices[cov_folder_nominal] = pd.DataFrame(
            fit_res_i['nominal_cov'],
            index=param_names,
            columns=param_names)
        # Flatten rand/nominal/gen values into suffixed columns
        for res_name, res_value in fit_res_i['rand'].items():
            fit_res['{}_rand'.format(res_name)] = res_value
        for res_name, res_value in fit_res_i['nominal'].items():
            fit_res['{}_nominal'.format(res_name)] = res_value
        for res_name, res_value in fit_res_i['gen'].items():
            fit_res['{}_gen'.format(res_name)] = res_value
        data_res.append(fit_res)
    data_frame = pd.DataFrame(data_res)
    # Attach the job id as a constant column
    fit_result_frame = pd.concat(
        [data_frame,
         pd.concat([pd.DataFrame({'jobid': [job_id]})] *
                   data_frame.shape[0]).reset_index(drop=True)],
        axis=1)
    try:
        # pylint: disable=E1101
        with _paths.work_on_file(
                config['name'],
                path_func=_paths.get_toy_fit_path,
                link_from=config.get('link-from', None)) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                # First fit results
                hdf_file.append('fit_results', fit_result_frame)
                # Save covariance matrix under 'covariance/jobid/fitnum'
                for cov_folder, cov_matrix in cov_matrices.items():
                    cov_path = os.path.join('covariance', cov_folder)
                    hdf_file.append(cov_path, cov_matrix)
                # Generator info
                hdf_file.append(
                    'input_values',
                    pd.DataFrame.from_dict(randomizer.get_input_values(),
                                           orient='index'))
            logger.info("Written output to %s", toy_fit_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))
def run(config_files, link_from):
    """Run the script.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.

    Raise:
        KeyError: If some configuration data are missing.
        OSError: If there either the configuration file does not exist or
            if there is a problem preparing the output path.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the generation.
    """
    # Configure
    try:
        config = load_config(*config_files,
                             validate=['gen/nevents', 'name', 'gen-model'])
    except OSError:
        raise OSError(
            "Cannot load configuration files: {}".format(config_files))
    except ConfigError as error:
        # One targeted log message per missing mandatory key, then fail
        if 'gen/nevents' in error.missing_keys:
            logger.error("Number of events not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'gen-model' in error.missing_keys:
            logger.error(
                "No generation model were specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        raise  # Ignore renaming
    logger.info("Generating %s events", config['gen']['nevents'])
    logger.info("Generation job name: %s", config['name'])
    # The command-line argument takes precedence over the config file
    if link_from:
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Set seed: both numpy and the ROOT generator, so the toy is reproducible
    job_id = get_job_id()
    seed = get_urandom_int(4)
    np.random.seed(seed=seed)
    ROOT.RooRandom.randomGenerator().SetSeed(seed)
    # Generate
    try:
        physics = configure_model(config['gen-model'])
    except KeyError as error:
        logger.error("Cannot find physics factory")
        raise ValueError('{}'.format(error))
    except ValueError:
        logger.error("Problem dealing with shared parameters")
        raise
    if isinstance(physics, (SumPhysicsFactory, SimultaneousPhysicsFactory)):
        logger.warning("Generating a RooAddPdf or a RooSimultaneous: "
                       "yields will be generated at a fixed value")
    try:
        # 'nevents-per-job' allows splitting the generation across batch jobs
        dataset = generate(
            physics,
            config['gen'].get('nevents-per-job', config['gen']['nevents']))
    except ValueError as error:
        logger.exception("Exception on generation")
        raise RuntimeError(str(error))
    # Get toy information: one single-element column per generation parameter
    toy_info = {
        var.GetName(): [var.getVal()]
        for var in physics.get_gen_parameters()
    }
    # nevents may be a per-category dict (simultaneous models) or a plain number
    n_evts = sum(config['gen']['nevents'].values()) \
        if isinstance(config['gen']['nevents'], dict) \
        else config['gen']['nevents']
    toy_info.update({'seed': [seed], 'jobid': [job_id], 'nevents': n_evts})
    try:
        # Save
        with work_on_file(config['name'],
                          path_func=get_toy_path,
                          link_from=config.get('link-from')) as toy_file:
            with modify_hdf(toy_file) as hdf_file:
                # Tag every generated event with the job id for traceability
                hdf_file.append('data', dataset.assign(jobid=job_id))
                hdf_file.append('toy_info', pd.DataFrame(toy_info))
            # Say something
            logger.info("Written output to %s", toy_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))
def run(config_files, link_from, verbose):
    """Run the script.

    Run a sample/fit sequence as many times as requested.

    Arguments:
        config_files (list[str]): Path to the configuration files.
        link_from (str): Path to link the results from.
        verbose (bool): Give verbose output?

    Raise:
        OSError: If there either the configuration file does not exist some
            of the input toys cannot be found.
        AttributeError: If the input data are incompatible with a previous
            fit.
        KeyError: If some configuration data are missing.
        ValueError: If there is any problem in configuring the PDF factories.
        RuntimeError: If there is a problem during the fitting.
    """
    try:
        config = _config.load_config(*config_files,
                                     validate=['fit/nfits', 'name', 'data'])
    except OSError:
        raise OSError(
            "Cannot load configuration files: {}".format(config_files))
    except ConfigError as error:
        if 'fit/nfits' in error.missing_keys:
            logger.error("Number of fits not specified")
        if 'name' in error.missing_keys:
            logger.error("No name was specified in the config file!")
        if 'data' in error.missing_keys:
            logger.error("No input data specified in the config file!")
        raise KeyError("ConfigError raised -> {}".format(error.missing_keys))
    except KeyError as error:
        logger.error("YAML parsing error -> %s", error)
        # Fix: the exception used to be swallowed here, leaving 'config'
        # undefined and causing a NameError further down
        raise
    try:
        models = {
            model_name: config[model_name]
            for model_name in config['fit'].get('models', ['model'])
        }
    except KeyError as error:
        logger.error("Missing model configuration -> %s", str(error))
        raise KeyError("Missing model configuration")
    if not models:
        logger.error(
            "Empty list specified in the config file under 'fit/models'!")
        raise KeyError("No fit models specified")
    fit_strategies = config['fit'].get('strategies', ['simple'])
    if not fit_strategies:
        logger.error("Empty fit strategies were specified in the config file!")
        raise KeyError("No fit strategies specified")
    # Some info
    nfits = config['fit'].get('nfits-per-job', config['fit']['nfits'])
    logger.info("Doing %s sample/fit sequences", nfits)
    logger.info("Fit job name: %s", config['name'])
    if link_from:
        # Command-line argument overrides the config file
        config['link-from'] = link_from
    if 'link-from' in config:
        logger.info("Linking toy data from %s", config['link-from'])
    else:
        logger.debug("No linking specified")
    # Analyze data requirements
    logger.info("Loading input data")
    data = {}
    gen_values = {}
    # Either all data sources specify a category or none does.
    # Fix: check the source configurations, not a substring of their keys.
    if len(set('category' in data_source
               for data_source in config['data'].values())) > 1:
        raise KeyError("Categories in 'data' not consistently specified.")
    for data_id, data_source in config['data'].items():
        try:
            source_toy = data_source['source']
        except KeyError:
            logger.error("Data source not specified")
            raise
        data[data_id] = (get_data({'source': source_toy,
                                   'source-type': 'toy',
                                   'tree': 'data',
                                   'output-format': 'pandas',
                                   'selection': data_source.get('selection')}),
                         data_source['nevents'],
                         data_source.get('poisson'),
                         data_source.get('category'))
        # Generator values
        toy_info = get_data({'source': source_toy,
                             'source-type': 'toy',
                             'tree': 'toy_info',
                             'output-format': 'pandas'})
        gen_values[data_id] = {}
        for var_name in toy_info.columns:
            # Bookkeeping columns are not generator parameters
            if var_name in ('seed', 'jobid', 'nevents'):
                continue
            gen_values[data_id][var_name] = toy_info[var_name].iloc[0]
    try:
        fit_models = {}
        for model_name in models:
            if model_name not in config:
                raise KeyError(
                    "Missing model definition -> {}".format(model_name))
            fit_models[model_name] = configure_model(config[model_name])
            if any(yield_.isConstant()
                   for yield_ in fit_models[model_name].get_yield_vars()
                   if yield_):
                logger.warning(
                    "Model %s has constant yields. "
                    "Be careful when configuring the input data, you may need to disable poisson sampling",
                    model_name)
    except KeyError:
        logger.exception("Error loading model")
        raise ValueError("Error loading model")
    if len(set(model.is_extended() for model in fit_models.values())) == 2:
        logger.error("Mix of extended and non-extended models!")
        raise ValueError("Error loading fit models")
    # Let's check these generator values against the output file
    try:
        gen_values_frame = {}
        # pylint: disable=E1101
        with _paths.work_on_file(config['name'],
                                 _paths.get_toy_fit_path,
                                 config.get('link-from')) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                logger.debug("Checking generator values")
                test_gen = [('gen_{}'.format(data_source)) in hdf_file
                            for data_source in gen_values]
                if all(test_gen):
                    # The data were written already, crosscheck values.
                    # Fix: index by source_id; 'data_source' was a stale
                    # variable from the config loop (a dict, not the id).
                    for source_id, gen_value in gen_values.items():
                        if not all(hdf_file['gen_{}'.format(source_id)]
                                   [var_name].iloc[0] == var_value
                                   for var_name, var_value
                                   in gen_value.items()):
                            raise AttributeError(
                                "Generated and stored values don't match for source '{}'"
                                .format(source_id))
                elif not any(test_gen):
                    # No data were there, just overwrite.
                    # Fix: loop variable renamed so it no longer shadows
                    # the 'gen_values' dict being iterated.
                    for source_id, gen_value in gen_values.items():
                        gen_data = {'id': source_id,
                                    'source': _paths.get_toy_path(
                                        config['data'][source_id]['source']),
                                    'nevents':
                                        config['data'][source_id]['nevents']}
                        gen_data.update(gen_value)
                        gen_values_frame['gen_{}'.format(source_id)] = \
                            pd.DataFrame([gen_data])
                else:
                    raise AttributeError("Inconsistent number of data sources")
    except OSError as excp:
        logger.error(str(excp))
        raise
    # Now load the acceptance
    try:
        acceptance = get_acceptance(config['acceptance']) \
            if 'acceptance' in config \
            else None
    except ConfigError as error:
        raise KeyError("Error loading acceptance -> {}".format(error))
    # Prepare output
    gen_events = defaultdict(list)
    # Set seed: derive it from the batch job id when available
    job_id = get_job_id()
    if job_id:
        seed = int(job_id.split('.')[0])
    else:
        import random
        job_id = 'local'
        seed = random.randint(0, 100000)
    np.random.seed(seed=seed)
    ROOT.RooRandom.randomGenerator().SetSeed(seed)
    # Start looping
    fit_results = defaultdict(list)
    logger.info("Starting sampling-fit loop (print frequency is 20)")
    initial_mem = memory_usage()
    initial_time = default_timer()
    for fit_num in range(nfits):
        # Logging
        if (fit_num + 1) % 20 == 0:
            logger.info("  Fitting event %s/%s", fit_num + 1, nfits)
        # Get a compound dataset; reseed per toy for reproducibility
        seed = get_urandom_int(4)
        np.random.seed(seed=seed)
        ROOT.RooRandom.randomGenerator().SetSeed(seed)
        try:
            logger.debug("Sampling input data")
            datasets, sample_sizes = get_datasets(data, acceptance, fit_models)
            for sample_name, sample_size in sample_sizes.items():
                gen_events['N^{{{}}}_{{gen}}'.format(sample_name)].append(
                    sample_size)
            logger.debug("Sampling finalized")
        except KeyError:
            logger.exception("Bad data configuration")
            raise
        logger.debug("Fitting")
        for model_name in models:
            dataset = datasets.pop(model_name)
            fit_model = fit_models[model_name]
            # Now fit
            for fit_strategy in fit_strategies:
                toy_key = (model_name, fit_strategy)
                try:
                    fit_result = fit(
                        fit_model, model_name, fit_strategy, dataset,
                        verbose,
                        Extended=config['fit'].get('extended', False),
                        Minos=config['fit'].get('minos', False))
                except ValueError as error:
                    # Keep the RuntimeError contract but preserve the reason
                    raise RuntimeError(
                        "Fit failed on toy {} -> {}".format(fit_num, error))
                # Now results are in fit_parameters
                result_roofit = FitResult.from_roofit(fit_result)
                result = result_roofit.to_plain_dict()
                result['cov_matrix'] = result_roofit.get_covariance_matrix()
                # Materialize the view so it survives object destruction
                result['param_names'] = list(
                    result_roofit.get_fit_parameters().keys())
                result['fitnum'] = fit_num
                result['seed'] = seed
                fit_results[toy_key].append(result)
                _root.destruct_object(fit_result)
            _root.destruct_object(dataset)
        logger.debug("Cleaning up")
    logger.info("Fitting loop over")
    logger.info("--> Memory leakage: %.2f MB/sample-fit",
                (memory_usage() - initial_mem) / nfits)
    logger.info("--> Spent %.0f ms/sample-fit",
                (default_timer() - initial_time) * 1000.0 / nfits)
    logger.info("Saving to disk")
    data_res = []
    cov_matrices = {}
    # Get gen values for this model
    for (model_name, fit_strategy), fits in fit_results.items():
        for fit_res in fits:
            fit_res = fit_res.copy()
            fit_res['model_name'] = model_name
            fit_res['fit_strategy'] = fit_strategy
            # NOTE(review): the folder key ignores model/strategy, so results
            # from different (model, strategy) pairs with the same fitnum
            # share one covariance slot — confirm this is intentional
            cov_folder = os.path.join(str(job_id), str(fit_res['fitnum']))
            param_names = fit_res.pop('param_names')
            cov_matrices[cov_folder] = pd.DataFrame(
                fit_res.pop('cov_matrix'),
                index=param_names,
                columns=param_names)
            data_res.append(fit_res)
    data_frame = pd.DataFrame(data_res)
    # Attach generated event counts and the job id as extra columns
    fit_result_frame = pd.concat(
        [pd.DataFrame(gen_events),
         data_frame,
         pd.concat([pd.DataFrame({'jobid': [job_id]})] *
                   data_frame.shape[0]).reset_index(drop=True)],
        axis=1)
    try:
        # pylint: disable=E1101
        with _paths.work_on_file(
                config['name'],
                path_func=_paths.get_toy_fit_path,
                link_from=config.get('link-from')) as toy_fit_file:
            with modify_hdf(toy_fit_file) as hdf_file:
                # First fit results
                hdf_file.append('fit_results', fit_result_frame)
                # Save covariance matrix under 'covariance/jobid/fitnum'
                for cov_folder, cov_matrix in cov_matrices.items():
                    cov_path = os.path.join('covariance', cov_folder)
                    hdf_file.append(cov_path, cov_matrix)
                # Generator info
                for key_name, gen_frame in gen_values_frame.items():
                    hdf_file.append(key_name, gen_frame)
            logger.info("Written output to %s", toy_fit_file)
            if 'link-from' in config:
                logger.info("Linked to %s", config['link-from'])
    except OSError as excp:
        logger.error(str(excp))
        raise
    except ValueError as error:
        logger.exception("Exception on dataset saving")
        raise RuntimeError(str(error))