Example #1
0
 def store_mcmc_iteration_info(self, proposed_params, proposed_loglike,
                               accept, i_run):
     """
     Record the details of a single MCMC iteration in the output database.

     :param proposed_params: parameter values proposed at this iteration
     :param proposed_loglike: loglikelihood of the proposed parameters
     :param accept: whether the proposal was accepted
     :param i_run: index of the current iteration
     """
     # Pair each parameter name with its proposed value.
     run_record = dict(zip(self.param_list, proposed_params))
     run_record["loglikelihood"] = proposed_loglike
     run_record["accept"] = 1 if accept else 0
     # Column order: parameters first, then the two bookkeeping columns.
     column_names = self.param_list + ["loglikelihood", "accept"]
     run_df = pd.DataFrame(run_record,
                           columns=column_names,
                           index=[i_run])
     store_database(
         run_df,
         table_name="mcmc_run",
         run_idx=i_run,
         database_path=self.output_db_path,
     )
Example #2
0
 def store_model_outputs(self, scenario_index):
     """
     Persist the requested scenario's model outputs to the database.

     :param scenario_index: index of the scenario whose model outputs to store
     """
     model = self.scenarios[scenario_index].model
     # Raw compartment trajectories, one column per compartment.
     outputs_df = pd.DataFrame(model.outputs, columns=model.compartment_names)
     # Post-processed quantities derived from the raw outputs.
     derived_df = pd.DataFrame.from_dict(model.derived_outputs)
     store_database(
         derived_df,
         table_name="derived_outputs",
         run_idx=self.iter_num,
         database_name=self.output_db_path,
         scenario=scenario_index,
     )
     store_database(
         outputs_df,
         table_name="outputs",
         run_idx=self.iter_num,
         times=model.times,
         database_name=self.output_db_path,
         scenario=scenario_index,
     )
Example #3
0
 def store_model_outputs(self):
     """
     Persist the latest scenario's model outputs to the database.

     Fails with an AssertionError if no model has been run yet.
     """
     latest = self.latest_scenario
     assert latest, "No model has been run"
     run_model = latest.model
     # Raw compartment trajectories, one column per compartment.
     compartment_df = pd.DataFrame(run_model.outputs,
                                   columns=run_model.compartment_names)
     # Post-processed quantities derived from the raw outputs.
     derived_df = pd.DataFrame.from_dict(run_model.derived_outputs)
     store_database(
         derived_df,
         table_name="derived_outputs",
         run_idx=self.iter_num,
         database_path=self.output_db_path,
         scenario=latest.idx,
     )
     store_database(
         compartment_df,
         table_name="outputs",
         run_idx=self.iter_num,
         times=run_model.times,
         database_path=self.output_db_path,
         scenario=latest.idx,
     )
Example #4
0
    def loglikelihood(self, params, to_return=BEST_LL):
        """
        Calculate the loglikelihood for a set of parameters.

        :param params: proposed parameter values, ordered as self.param_list
        :param to_return: BEST_LL to return the best loglikelihood,
            BEST_START to return the associated model start time
        :return: the best loglikelihood (sum of squares in least-squares
            mode) or the best start time, depending on to_return
        :raises ValueError: if to_return or a target's
            loglikelihood_distri is not recognised
        """
        scenario, pp = self.run_model_with_params(params)

        model_start_time = pp.derived_outputs["times"][0]
        considered_start_times = [model_start_time]
        best_start_time = None
        if self.run_mode == CalibrationMode.LEAST_SQUARES:
            # Least-squares minimises, so start from a very large +ve number.
            best_ll = 1.0e60
        else:
            # Other modes maximise, so start from a very large -ve number.
            best_ll = -1.0e60

        for considered_start_time in considered_start_times:
            time_shift = considered_start_time - model_start_time
            # Loglikelihood if using bayesian approach; sum of squares in
            # least-squares mode.
            ll = 0
            for target in self.targeted_outputs:
                key = target["output_key"]
                data = np.array(target["values"])
                time_weights = target["time_weights"]
                if key in pp.generated_outputs:
                    model_output = np.array(pp.generated_outputs[key])
                else:
                    # Look up model outputs at the target years, shifted to
                    # align with the considered start time.
                    indices = [
                        scenario.model.times.index(year - time_shift)
                        for year in target["years"]
                    ]
                    model_output = np.array(
                        [pp.derived_outputs[key][index] for index in indices])

                if self.run_mode == CalibrationMode.LEAST_SQUARES:
                    squared_distance = (data - model_output)**2
                    ll += np.sum([
                        w * d for (w, d) in zip(time_weights, squared_distance)
                    ])
                else:
                    if "loglikelihood_distri" not in target:
                        # Default to a normal likelihood.
                        target["loglikelihood_distri"] = "normal"
                    if target["loglikelihood_distri"] == "normal":
                        if key + "_dispersion_param" in self.param_list:
                            normal_sd = params[self.param_list.index(
                                key + "_dispersion_param")]
                        else:
                            normal_sd = target["sd"]
                        squared_distance = (data - model_output)**2
                        ll += -(0.5 / normal_sd**2) * np.sum([
                            w * d
                            for (w, d) in zip(time_weights, squared_distance)
                        ])
                    elif target["loglikelihood_distri"] == "poisson":
                        for i in range(len(data)):
                            ll += (round(data[i]) * math.log(
                                abs(model_output[i])) - model_output[i] -
                                   math.log(math.factorial(round(
                                       data[i])))) * time_weights[i]
                    elif target["loglikelihood_distri"] == "negative_binomial":
                        assert key + "_dispersion_param" in self.param_list
                        # The dispersion parameter varies during the MCMC, so
                        # retrieve its currently proposed value.
                        n = params[self.param_list.index(
                            key + "_dispersion_param")]
                        for i in range(len(data)):
                            # Negative binomial parameterised via mean and
                            # dispersion: choose p so the distribution mean
                            # matches the model output.
                            mu = model_output[i]
                            p = mu / (mu + n)
                            ll += stats.nbinom.logpmf(round(
                                data[i]), n, 1.0 - p) * time_weights[i]
                    else:
                        raise ValueError(
                            "Distribution not supported in loglikelihood_distri"
                        )

            if self.run_mode == CalibrationMode.LEAST_SQUARES:
                is_new_best_ll = ll < best_ll
            else:
                is_new_best_ll = ll > best_ll

            if is_new_best_ll:
                best_ll, best_start_time = (ll, considered_start_time)

        if self.run_mode == CalibrationMode.LEAST_SQUARES:
            mcmc_run_dict = {k: v for k, v in zip(self.param_list, params)}
            mcmc_run_dict["loglikelihood"] = best_ll
            mcmc_run_colnames = self.param_list.copy()
            # BUG FIX: list.append returns None, so the previous
            # `mcmc_run_colnames = mcmc_run_colnames.append(...)` silently
            # set the column list to None and lost the column ordering.
            mcmc_run_colnames.append("loglikelihood")
            mcmc_run_df = pd.DataFrame(mcmc_run_dict,
                                       columns=mcmc_run_colnames,
                                       index=[self.iter_num])
            store_database(
                mcmc_run_df,
                table_name="mcmc_run",
                run_idx=self.iter_num,
                database_path=self.output_db_path,
            )

        # Record the evaluated parameter set alongside its loglikelihood.
        self.evaluated_params_ll.append(
            (copy.copy(params), copy.copy(best_ll)))

        if to_return == BEST_LL:
            return best_ll
        elif to_return == BEST_START:
            return best_start_time
        else:
            raise ValueError("to_return not recognised")
Example #5
0
    def loglikelihood(self, params, to_return="best_ll"):
        """
        Calculate the loglikelihood for a set of parameters.

        :param params: proposed parameter values, ordered as self.param_list
        :param to_return: "best_ll" to return the best loglikelihood,
            "best_start_time" to return the associated model start time
        :return: the best loglikelihood (sum of squares in "lsm" mode) or
            the best start time, depending on to_return
        :raises ValueError: if to_return or a target's loglikelihood_distri
            is not recognised, or variable start times are requested for a
            generated output
        """
        # Placeholder for a memoisation lookup over previously evaluated
        # parameter sets; currently always None, so the model always runs.
        best_ll = None

        if best_ll is None:
            self.run_model_with_params(params)

            model_start_time = self.post_processing.derived_outputs["times"][0]
            if self.start_time_range is None:
                considered_start_times = [model_start_time]
            else:
                # Integer-spaced candidate start times spanning the range.
                considered_start_times = np.linspace(
                    self.start_time_range[0],
                    self.start_time_range[1],
                    num=self.start_time_range[1] - self.start_time_range[0] +
                    1,
                )

            best_ll, best_start_time = (-1.0e60, None)
            if self.run_mode == "lsm":
                # Least-squares minimises, so start from a very large +ve number.
                best_ll = 1.0e60
            for considered_start_time in considered_start_times:
                time_shift = considered_start_time - model_start_time
                # Loglikelihood if using bayesian approach; sum of squares
                # in lsm mode.
                ll = 0
                for target in self.targeted_outputs:
                    key = target["output_key"]
                    data = np.array(target["values"])
                    if key in self.post_processing.generated_outputs:
                        if self.start_time_range is not None:
                            raise ValueError(
                                "variable start time implemented for derived_outputs only"
                            )
                        model_output = np.array(
                            self.post_processing.generated_outputs[key])
                    else:
                        # Look up model outputs at the target years, shifted
                        # to align with the considered start time.
                        indices = [
                            self.scenarios[0].model.times.index(year -
                                                                time_shift)
                            for year in target["years"]
                        ]
                        model_output = np.array([
                            self.post_processing.derived_outputs[key][index]
                            for index in indices
                        ])

                    if self.run_mode == "lsm":
                        ll += np.sum((data - model_output)**2)
                    else:
                        if "loglikelihood_distri" not in target:
                            # Default to a normal likelihood.
                            target["loglikelihood_distri"] = "normal"
                        if target["loglikelihood_distri"] == "normal":
                            ll += -(0.5 / target["sd"]**2) * np.sum(
                                (data - model_output)**2)
                        elif target["loglikelihood_distri"] == "poisson":
                            for i in range(len(data)):
                                ll += (
                                    data[i] * math.log(abs(model_output[i])) -
                                    model_output[i] -
                                    math.log(math.factorial(data[i])))
                        elif target[
                                "loglikelihood_distri"] == "negative_binomial":
                            assert key + '_dispersion_param' in self.param_list
                            # The dispersion parameter varies during the MCMC,
                            # so retrieve its currently proposed value.
                            n = params[self.param_list.index(
                                key + '_dispersion_param')]
                            for i in range(len(data)):
                                # Negative binomial parameterised via mean and
                                # dispersion: choose p so the distribution
                                # mean matches the model output.
                                mu = model_output[i]
                                p = mu / (mu + n)
                                ll += stats.nbinom.logpmf(data[i], n, 1. - p)
                        else:
                            raise ValueError(
                                "Distribution not supported in loglikelihood_distri"
                            )

                if (ll > best_ll and self.run_mode != "lsm") or (
                        ll < best_ll and self.run_mode == "lsm"):
                    best_ll, best_start_time = (ll, considered_start_time)

            if self.run_mode == "lsm":
                mcmc_run_dict = {k: v for k, v in zip(self.param_list, params)}
                mcmc_run_dict["loglikelihood"] = best_ll
                mcmc_run_colnames = self.param_list.copy()
                # BUG FIX: list.append returns None, so the previous
                # `mcmc_run_colnames = mcmc_run_colnames.append(...)` silently
                # set the column list to None and lost the column ordering.
                mcmc_run_colnames.append("loglikelihood")
                mcmc_run_df = pd.DataFrame(mcmc_run_dict,
                                           columns=mcmc_run_colnames,
                                           index=[self.iter_num])
                store_database(
                    mcmc_run_df,
                    table_name="mcmc_run",
                    run_idx=self.iter_num,
                    database_name=self.output_db_path,
                )
            # Record the evaluated parameter set alongside its loglikelihood.
            self.evaluated_params_ll.append(
                (copy.copy(params), copy.copy(best_ll)))

        if to_return == "best_ll":
            return best_ll
        elif to_return == "best_start_time":
            return best_start_time
        else:
            raise ValueError("to_return not recognised")