def store_mcmc_iteration_info(self, proposed_params, proposed_loglike, accept, i_run):
    """
    Record the details of one MCMC iteration in the output database.

    :param proposed_params: the parameter values proposed at this iteration
    :param proposed_loglike: the loglikelihood of the proposed parameters
    :param accept: whether the proposal was accepted or not
    :param i_run: the iteration number
    """
    row = dict(zip(self.param_list, proposed_params))
    row["loglikelihood"] = proposed_loglike
    row["accept"] = 1 if accept else 0
    # Fix the column order: parameters first, then loglikelihood and accept flag.
    column_names = self.param_list + ["loglikelihood", "accept"]
    run_df = pd.DataFrame(row, columns=column_names, index=[i_run])
    store_database(
        run_df,
        table_name="mcmc_run",
        run_idx=i_run,
        database_path=self.output_db_path,
    )
def store_model_outputs(self, scenario_index):
    """
    Record the model outputs in the database.

    Stores both the derived outputs and the raw compartment outputs of the
    requested scenario's model, tagged with the current iteration number.

    :param scenario_index: index into self.scenarios selecting the model to store
    """
    model = self.scenarios[scenario_index].model
    compartment_df = pd.DataFrame(model.outputs, columns=model.compartment_names)
    derived_df = pd.DataFrame.from_dict(model.derived_outputs)
    store_database(
        derived_df,
        table_name="derived_outputs",
        run_idx=self.iter_num,
        database_name=self.output_db_path,
        scenario=scenario_index,
    )
    store_database(
        compartment_df,
        table_name="outputs",
        run_idx=self.iter_num,
        times=model.times,
        database_name=self.output_db_path,
        scenario=scenario_index,
    )
def store_model_outputs(self):
    """
    Record the model outputs in the database.

    Stores both the derived outputs and the raw compartment outputs of the
    latest scenario's model, tagged with the current iteration number.

    :raises ValueError: if no model has been run yet
    """
    scenario = self.latest_scenario
    # Validate explicitly rather than with `assert`, which is stripped
    # when Python runs with optimizations enabled (`python -O`).
    if not scenario:
        raise ValueError("No model has been run")
    model = scenario.model
    out_df = pd.DataFrame(model.outputs, columns=model.compartment_names)
    derived_output_df = pd.DataFrame.from_dict(model.derived_outputs)
    store_database(
        derived_output_df,
        table_name="derived_outputs",
        run_idx=self.iter_num,
        database_path=self.output_db_path,
        scenario=scenario.idx,
    )
    store_database(
        out_df,
        table_name="outputs",
        run_idx=self.iter_num,
        times=model.times,
        database_path=self.output_db_path,
        scenario=scenario.idx,
    )
def loglikelihood(self, params, to_return=BEST_LL):
    """
    Calculate the loglikelihood for a set of parameters.

    In least-squares mode this computes a (weighted) sum of squared residuals
    to be minimised; otherwise it computes a loglikelihood to be maximised,
    using the per-target distribution ("normal", "poisson" or
    "negative_binomial", defaulting to "normal").

    :param params: proposed parameter values, ordered as in self.param_list
    :param to_return: BEST_LL to return the best loglikelihood found,
        BEST_START to return the start time that achieved it
    :return: the best loglikelihood (or sum of squares) or the best start time
    :raises ValueError: for an unsupported likelihood distribution, a missing
        dispersion parameter, or an unrecognised to_return value
    """
    scenario, pp = self.run_model_with_params(params)
    model_start_time = pp.derived_outputs["times"][0]
    considered_start_times = [model_start_time]
    best_start_time = None
    if self.run_mode == CalibrationMode.LEAST_SQUARES:
        # Initial best "loglikelihood" is a very large +ve number (we minimise).
        best_ll = 1.0e60
    else:
        # Initial best loglikelihood is a very large -ve number (we maximise).
        best_ll = -1.0e60
    for considered_start_time in considered_start_times:
        time_shift = considered_start_time - model_start_time
        # Loglikelihood if using bayesian approach; sum of squares in lsm mode.
        ll = 0
        for target in self.targeted_outputs:
            key = target["output_key"]
            data = np.array(target["values"])
            time_weights = target["time_weights"]
            if key in pp.generated_outputs:
                model_output = np.array(pp.generated_outputs[key])
            else:
                # Map each target year (shifted by the candidate start time)
                # onto the model's time index.
                indices = []
                for year in target["years"]:
                    time_idx = scenario.model.times.index(year - time_shift)
                    indices.append(time_idx)
                model_output = np.array(
                    [pp.derived_outputs[key][index] for index in indices]
                )
            if self.run_mode == CalibrationMode.LEAST_SQUARES:
                squared_distance = (data - model_output) ** 2
                ll += np.sum(
                    [w * d for (w, d) in zip(time_weights, squared_distance)]
                )
            else:
                if "loglikelihood_distri" not in target:  # default distribution
                    target["loglikelihood_distri"] = "normal"
                if target["loglikelihood_distri"] == "normal":
                    # Use the calibrated dispersion parameter if present,
                    # otherwise the target's fixed standard deviation.
                    if key + "_dispersion_param" in self.param_list:
                        normal_sd = params[
                            self.param_list.index(key + "_dispersion_param")
                        ]
                    else:
                        normal_sd = target["sd"]
                    squared_distance = (data - model_output) ** 2
                    ll += -(0.5 / normal_sd ** 2) * np.sum(
                        [w * d for (w, d) in zip(time_weights, squared_distance)]
                    )
                elif target["loglikelihood_distri"] == "poisson":
                    for i in range(len(data)):
                        ll += (
                            round(data[i]) * math.log(abs(model_output[i]))
                            - model_output[i]
                            - math.log(math.factorial(round(data[i])))
                        ) * time_weights[i]
                elif target["loglikelihood_distri"] == "negative_binomial":
                    # The dispersion parameter varies during the MCMC, so its
                    # current value must be retrieved from the proposed set.
                    # Raise (rather than assert, which is stripped under
                    # `python -O`) if it is missing.
                    if key + "_dispersion_param" not in self.param_list:
                        raise ValueError(
                            key + "_dispersion_param is not in param_list"
                        )
                    n = params[self.param_list.index(key + "_dispersion_param")]
                    for i in range(len(data)):
                        # We use the parameterisation based on mean and variance
                        # and assume var = mean ** delta: work out parameter p to
                        # match the distribution mean with the model output.
                        mu = model_output[i]
                        p = mu / (mu + n)
                        ll += (
                            stats.nbinom.logpmf(round(data[i]), n, 1.0 - p)
                            * time_weights[i]
                        )
                else:
                    raise ValueError(
                        "Distribution not supported in loglikelihood_distri"
                    )
        if self.run_mode == CalibrationMode.LEAST_SQUARES:
            # Least squares: smaller is better.
            is_new_best_ll = ll < best_ll
        else:
            is_new_best_ll = ll > best_ll
        if is_new_best_ll:
            best_ll, best_start_time = (ll, considered_start_time)
    if self.run_mode == CalibrationMode.LEAST_SQUARES:
        mcmc_run_dict = {k: v for k, v in zip(self.param_list, params)}
        mcmc_run_dict["loglikelihood"] = best_ll
        # BUG FIX: list.append returns None, so the previous reassignment
        # (`colnames = colnames.append(...)`) passed columns=None to the
        # DataFrame and lost the intended column ordering.
        mcmc_run_colnames = self.param_list.copy()
        mcmc_run_colnames.append("loglikelihood")
        mcmc_run_df = pd.DataFrame(
            mcmc_run_dict, columns=mcmc_run_colnames, index=[self.iter_num]
        )
        store_database(
            mcmc_run_df,
            table_name="mcmc_run",
            run_idx=self.iter_num,
            database_path=self.output_db_path,
        )
    # Cache the evaluated parameters alongside their best loglikelihood.
    self.evaluated_params_ll.append((copy.copy(params), copy.copy(best_ll)))
    if to_return == BEST_LL:
        return best_ll
    elif to_return == BEST_START:
        return best_start_time
    else:
        raise ValueError("to_return not recognised")
def loglikelihood(self, params, to_return="best_ll"):
    """
    Calculate the loglikelihood for a set of model parameters.

    In "lsm" mode this computes a sum of squared residuals to be minimised;
    otherwise it computes a loglikelihood to be maximised, using the
    per-target distribution ("normal", "poisson" or "negative_binomial",
    defaulting to "normal"). When a start-time range is configured, each
    candidate start time is evaluated and the best one is retained.

    :param params: model parameters, ordered as in self.param_list
    :param to_return: "best_ll" to return the best loglikelihood found,
        "best_start_time" to return the start time that achieved it
    :return: the best loglikelihood (or sum of squares) or the best start time
    :raises ValueError: for an unsupported likelihood distribution, a missing
        dispersion parameter, or an unrecognised to_return value
    """
    # run the model
    self.run_model_with_params(params)

    model_start_time = self.post_processing.derived_outputs["times"][0]
    if self.start_time_range is None:
        considered_start_times = [model_start_time]
    else:
        # One candidate start time per integer step across the requested range.
        considered_start_times = np.linspace(
            self.start_time_range[0],
            self.start_time_range[1],
            num=self.start_time_range[1] - self.start_time_range[0] + 1,
        )

    # "lsm" minimises a sum of squares, so start from a very large +ve value;
    # otherwise we maximise a loglikelihood, so start from a very large -ve one.
    best_ll, best_start_time = (-1.0e60, None)
    if self.run_mode == "lsm":
        best_ll = 1.0e60

    for considered_start_time in considered_start_times:
        time_shift = considered_start_time - model_start_time
        ll = 0  # loglikelihood if using bayesian approach. Sum of squares if using lsm mode
        for target in self.targeted_outputs:
            key = target["output_key"]
            data = np.array(target["values"])
            if key in self.post_processing.generated_outputs:
                if self.start_time_range is not None:
                    raise ValueError(
                        "variable start time implemented for derived_outputs only"
                    )
                model_output = np.array(self.post_processing.generated_outputs[key])
            else:
                # Map each target year (shifted by the candidate start time)
                # onto the model's time index.
                indices = []
                for year in target["years"]:
                    indices.append(
                        self.scenarios[0].model.times.index(year - time_shift)
                    )
                model_output = np.array(
                    [
                        self.post_processing.derived_outputs[key][index]
                        for index in indices
                    ]
                )

            if self.run_mode == "lsm":
                ll += np.sum((data - model_output) ** 2)
            else:
                if "loglikelihood_distri" not in target:  # default distribution
                    target["loglikelihood_distri"] = "normal"
                if target["loglikelihood_distri"] == "normal":
                    ll += -(0.5 / target["sd"] ** 2) * np.sum(
                        (data - model_output) ** 2
                    )
                elif target["loglikelihood_distri"] == "poisson":
                    for i in range(len(data)):
                        ll += (
                            data[i] * math.log(abs(model_output[i]))
                            - model_output[i]
                            - math.log(math.factorial(data[i]))
                        )
                elif target["loglikelihood_distri"] == "negative_binomial":
                    # The dispersion parameter varies during the MCMC, so its
                    # current value must be retrieved from the proposed set.
                    # Raise (rather than assert, which is stripped under
                    # `python -O`) if it is missing.
                    if key + '_dispersion_param' not in self.param_list:
                        raise ValueError(
                            key + "_dispersion_param is not in param_list"
                        )
                    n = params[self.param_list.index(key + '_dispersion_param')]
                    for i in range(len(data)):
                        # We use the parameterisation based on mean and variance
                        # and assume var = mean ** delta: work out parameter p to
                        # match the distribution mean with the model output.
                        mu = model_output[i]
                        p = mu / (mu + n)
                        ll += stats.nbinom.logpmf(data[i], n, 1. - p)
                else:
                    raise ValueError(
                        "Distribution not supported in loglikelihood_distri"
                    )

        if (ll > best_ll and self.run_mode != "lsm") or (
            ll < best_ll and self.run_mode == "lsm"
        ):
            best_ll, best_start_time = (ll, considered_start_time)

    if self.run_mode == "lsm":
        mcmc_run_dict = {k: v for k, v in zip(self.param_list, params)}
        mcmc_run_dict["loglikelihood"] = best_ll
        # BUG FIX: list.append returns None, so the previous reassignment
        # (`colnames = colnames.append(...)`) passed columns=None to the
        # DataFrame and lost the intended column ordering.
        mcmc_run_colnames = self.param_list.copy()
        mcmc_run_colnames.append("loglikelihood")
        mcmc_run_df = pd.DataFrame(
            mcmc_run_dict, columns=mcmc_run_colnames, index=[self.iter_num]
        )
        store_database(
            mcmc_run_df,
            table_name="mcmc_run",
            run_idx=self.iter_num,
            database_name=self.output_db_path,
        )

    # Cache the evaluated parameters alongside their best loglikelihood
    # (previously-commented-out memoization read from this list).
    self.evaluated_params_ll.append((copy.copy(params), copy.copy(best_ll)))

    if to_return == "best_ll":
        return best_ll
    elif to_return == "best_start_time":
        return best_start_time
    else:
        raise ValueError("to_return not recognised")