import logging
import os
import pickle
from hashlib import md5
from time import ctime

from pystan import StanModel


def calibrate_noise_model(benchmarks, all_node_results, run_name=None,
        model_filename="models/noise-with-outliers.stan", iterations=9000,
        warmup=8000, chains=1):
    """
    Run the given noise model for the benchmark stars.
    """

    if run_name is None:
        run_name = "unnamed"
    else:
        # If a name has been given, append a timestamp-based hash too.
        run_name = "-".join([run_name, md5(ctime().encode("utf-8")).hexdigest()])

    # Check for a compiled version of this model.
    basename, ext = os.path.splitext(model_filename)
    if os.path.exists(basename + ".pkl"):
        # There's a compiled version. Use that.
        model_filename = basename + ".pkl"
        logging.info("Using pre-compiled model {0}".format(model_filename))
        with open(model_filename, "rb") as fp:
            model = pickle.load(fp)
    else:
        # Compilation required. Pickle the compiled model afterwards so we
        # can skip compilation next time.
        model = StanModel(model_filename)
        pickled_model_filename = basename + ".pkl"
        logging.info("Pickling compiled model to {0}".format(pickled_model_filename))
        with open(pickled_model_filename, "wb") as fp:
            pickle.dump(model, fp)

    data, node_names = build_data_dict(benchmarks, all_node_results)

    logging.info("Optimizing...")
    op = model.optimizing(data=data)
    logging.info("Optimized values:\n{0}".format(op["par"]))

    logging.info("Fitting...")
    # `pars` expects parameter names, so pass the keys of the optimized dict.
    calibrated_model = model.sampling(data=data, pars=list(op["par"].keys()),
        iter=iterations, warmup=warmup, chains=chains)

    # Add the node names into the data dict.
    calibrated_model.data["node_names"] = node_names

    return calibrated_model
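# The compile-or-load-pickle logic above is a reusable pattern. Below is a
# minimal, self-contained sketch of just that pattern, assuming only a .stan
# file on disk; the helper name `load_or_compile` is illustrative and not
# part of the code above.
import os
import pickle

from pystan import StanModel


def load_or_compile(model_filename):
    """Load a pickled StanModel sitting next to the .stan file if one
    exists; otherwise compile the model and pickle it for next time."""
    basename, _ = os.path.splitext(model_filename)
    pickled_filename = basename + ".pkl"
    if os.path.exists(pickled_filename):
        with open(pickled_filename, "rb") as fp:
            return pickle.load(fp)
    model = StanModel(file=model_filename)
    with open(pickled_filename, "wb") as fp:
        pickle.dump(model, fp)
    return model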
def test_matrix_param_order_optimizing():
    model_code = """
    data {
      int<lower=2> K;
    }
    parameters {
      matrix[K,2] beta;
    }
    model {
      for (k in 1:K)
        beta[k,1] ~ normal(0,1);
      for (k in 1:K)
        beta[k,2] ~ normal(100,1);
    }"""
    sm = StanModel(model_code=model_code)
    op = sm.optimizing(data=dict(K=3))
    beta = op['par']['beta']
    # Column 1 sits near 0 and column 2 near 100, so the optimum must
    # preserve (row, column) order when the flat vector is reshaped.
    assert beta.shape == (3, 2)
    beta_colmeans = np.mean(beta, axis=0)
    assert beta_colmeans[0] < 4
    assert beta_colmeans[1] > 100 - 4
with open("stan_logreg_2.txt") as f:  # filename assumed; the original open() was lost upstream of this fragment
    stan_code = f.read()

stan_data_dict = {
    'N': data_wc.target.size,
    'p': 1,  # data_wc.data.shape[1],
    'X': data_wc.data[:, [0]],
    'y': data_wc.target
}

# Copy the custom C++ header into the file that Stan will #include.
tmpHeader = "my-multiply.txt"
with open("tmp.hpp", "w") as f:
    with open(tmpHeader, "r") as f_cur:
        tmpHeaderTxt = f_cur.read()
    f.write(tmpHeaderTxt)

if False:
    bayes_logreg2_native = StanModel(model_code=stan_code)
    # stan_fit_native = bayes_logreg2_native.sampling(data=stan_data_dict, iter=2000, chains=2)
    stan_optim2 = bayes_logreg2_native.optimizing(data=stan_data_dict, iter=50000)
    # print("optimum value of original (sq) NLL is {}".format(stan_optim2))

print("=================================")
print("Now onto the customised version...")
with open("stan_logreg_2my.txt") as f:
    stan_code_my = f.read()
bayes_logreg2_my = StanModel(model_code=stan_code_my, allow_undefined=True,
                             includes=['tmp.hpp'], include_dirs=["."])
stan_optim2_my = bayes_logreg2_my.optimizing(data=stan_data_dict, iter=50000)
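# The allow_undefined/includes combination above is PyStan's hook for
# user-supplied C++: the Stan program declares a function without a body,
# and the definition comes from an external header. Below is a minimal,
# self-contained sketch of the mechanism, adapted from the make_odds
# example in the PyStan docs; it is NOT the my-multiply code used above.
from pystan import StanModel

stan_code_ext = """
functions {
  real make_odds(real theta);  // declared only; defined in C++
}
parameters {
  real<lower=0, upper=1> theta;
}
model {
  theta ~ beta(2, 2);
}
generated quantities {
  real odds = make_odds(theta);
}
"""

# The C++ signature must match what Stan's code generator expects:
# a template on the argument type plus a trailing std::ostream* argument.
cpp_code = """
template <typename T0__>
typename boost::math::tools::promote_args<T0__>::type
make_odds(const T0__& theta, std::ostream* pstream__) {
  return theta / (1 - theta);
}
"""
with open("external.hpp", "w") as f:
    f.write(cpp_code)

model_ext = StanModel(model_code=stan_code_ext, allow_undefined=True,
                      includes=['external.hpp'], include_dirs=['.'])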
# CAR with count features
# =============================================================================

# Compile Stan model
car_model = StanModel(file=w_dir + '/code/kdd_modelling/Stan/poisson_sparse_CAR.stan')

# Define max_D for CAR models
max_D = 1

# Get data + fit model
X_train, X_test, _, y_train, y_test, _, _, _, _, _ = data_pipeline(
    feature_engineering=True, feature_type='count')

stan_data = dict(y_train=y_train.astype(np.int64).squeeze(),
                 y_test=y_test.astype(np.int64).squeeze(),
                 p=X_train.shape[1] - 2,
                 N_train=X_train.shape[0],
                 N_test=X_test.shape[0],
                 X_train=X_train[:, 2:],
                 X_test=X_test[:, 2:],
                 loc_train=X_train[:, 0:2],
                 loc_test=X_test[:, 0:2],
                 max_D=max_D)

model_fit = car_model.optimizing(data=stan_data, iter=20000, init="0")

# In- and out-of-sample predictions, RMSE and log-likelihood.
pred_oos = model_fit['pred_test']
pred_is = model_fit['pred_train']
rmse_oos = np.sqrt(np.mean((pred_oos - y_test.squeeze())**2))
lik_oos = np.sum(model_fit['log_lik_test'])
rmse_is = np.sqrt(np.mean((pred_is - y_train.squeeze())**2))
lik_is = np.sum(model_fit['log_lik'])

row = pd.Series({'RMSE_IS': rmse_is, 'LIK_IS': lik_is,
                 'RMSE_OOS': rmse_oos, 'LIK_OOS': lik_oos},
                name='CAR model (count)')
results_tbl = results_tbl.append(row)

# =============================================================================
# CAR with dist features
# =============================================================================
X_train, X_test, _, y_train, y_test, _, _, _, _, _ = data_pipeline(
    feature_engineering=True, feature_type='dist')
# Initial values for the optimiser.
initBeta = [{'beta': np.arange(-3, 3, 0.2) * 0.05}]

# Copy the custom C++ header into the file that Stan will #include.
tmpHeader = "my-dotprod.hpp"
with open("tmp.hpp", "w") as f:
    with open(tmpHeader, "r") as f_cur:
        tmpHeaderTxt = f_cur.read()
    f.write(tmpHeaderTxt)

if False:
    bayes_logreg3_native = StanModel(model_code=stan_code)
    # stan_fit_native = bayes_logreg2_native.sampling(data=stan_data_dict, iter=2000, chains=2)
    stan_optim3 = bayes_logreg3_native.optimizing(data=stan_data_dict,
                                                  iter=50000, init=initBeta)

if True:
    # print("optimum value of original (sq) NLL is {}".format(stan_optim2))
    print("=================================")
    print("Now onto the customised version...")
    with open("stan_logreg_3my.txt") as f:
        stan_code_my = f.read()
    if False:
        bayes_logreg3_my = StanModel(model_code=stan_code_my, allow_undefined=True,
                                     includes=['tmp.hpp'], include_dirs=["."])
    bayes_logreg3_my = StanModel(model_code=stan_code_my)
    increment_log_prob(log_sum_exp(
        log1m_alpha + normal_log(sp_vector[1], outlier_teff_mu, outlier_teff_sigma),
        log1m_alpha + normal_log(sp_vector[2], outlier_teff_mu, outlier_teff_sigma)
    ));
  }
}"""

# Ok, here is our toy data:
with open("toy.data", "r") as fp:
    data = json.load(fp)

model = StanModel(model_code=model_code)

print("Optimizing...")
op = model.optimizing(data=data)

print("Fitting...")
# `pars` expects parameter names, so pass the keys of the optimized dict.
fit = model.sampling(data=data, pars=list(op["par"].keys()), iter=20000)

subplots_adjust = {
    "left": 0.10, "bottom": 0.05, "right": 0.95, "top": 0.95,
    "wspace": 0.20, "hspace": 0.45
}

nodes = range(2)
dimensions = ("teff", "logg")

# Plot the m, b parameters for each node
dimensions_traced = []
for node in nodes:
    node_dimensions = \
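# The log_sum_exp construction above is the standard way to marginalize a
# discrete mixture indicator in Stan. Below is a minimal, self-contained
# sketch of the same pattern on synthetic data, using the same Stan 2.x-era
# syntax (increment_log_prob, normal_log) as the model above; the toy model
# and all names here are illustrative, not the outlier model used above.
import numpy as np
from pystan import StanModel

mixture_code = """
data {
  int<lower=1> N;
  vector[N] y;
}
parameters {
  real<lower=0, upper=1> alpha;  // outlier fraction
  real mu;                       // inlier mean
  real outlier_mu;               // outlier mean
}
model {
  for (n in 1:N)
    increment_log_prob(log_sum_exp(
      log1m(alpha) + normal_log(y[n], mu, 1),
      log(alpha) + normal_log(y[n], outlier_mu, 5)));
}
"""

# Synthetic data: 90 inliers around 0, 10 outliers around 10.
y = np.concatenate([np.random.normal(0, 1, 90), np.random.normal(10, 5, 10)])

mixture_model = StanModel(model_code=mixture_code)
mixture_op = mixture_model.optimizing(data={"N": y.size, "y": y})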
from typing import Dict, Tuple

from pystan import StanModel


class QM:
    """
    Base class for quantile matching (QM) models.

    Parameters
    ----------
    parameters_dict : Dict
        Keys are internal names for the priors of the model,
        e.g. 'mu', 'sigma' for a GaussianQM. Values are the
        user-defined Distributions. Note key must not be
        identical to value.name.
    """
    def __init__(self, parameters_dict: Dict[str, Distribution]):
        self.parameters_dict = self._check_dict(parameters_dict)
        self.model = None

    def __str__(self):
        return self.__class__.__name__ + '(' + \
            ', '.join([p.__str__() for p in self.parameters_dict.values()]) + \
            ')'

    def __repr__(self):
        return self.__str__()

    def _check_dict(self, parameters_dict: Dict[str, Distribution]):
        for key, value in parameters_dict.items():
            if not isinstance(value, Distribution):
                raise ValueError(
                    f'Input parameter "{key}" of "{self.__class__.__name__}" '
                    f'needs to be a Distribution (see bqme.distributions), '
                    f'but is of type {type(value)}.')
        return parameters_dict

    def _template_replacements(self) -> Dict[str, str]:
        """
        Returns a dict mapping template variables to the strings that
        replace them. Necessary keys:
        parametersnames, parameters, priors, cdf, lpdf, rng
        """
        distribution_name = self.__class__.__name__.replace("QM", "").lower()
        build = lambda s: '\n '.join([
            p.code()[s] for p in self.parameters_dict.values()
        ])
        replacements = {
            'parametersnames': ', '.join([
                p.name for p in self.parameters_dict.values()
            ]),
            'parameters': build('parameter'),
            'priors': build('prior'),
            'cdf': f'{distribution_name}_cdf',
            'lpdf': f'{distribution_name}_lpdf',
            'rng': f'{distribution_name}_rng',
        }
        return replacements

    def _stan_code(self) -> str:
        # Fill the $key$ placeholders in the Stan template.
        with open(STAN_TEMPLATE_PATH) as f:
            code = f.read()
        for k, v in self._template_replacements().items():
            code = code.replace(f'${k}$', v)
        return code

    def _check_domain(self, X):
        minn, maxx = self.domain()
        f = lambda x: not (minn < x < maxx)
        if len(list(filter(f, X))) > 0:
            raise ValueError(
                f'some elements of X are not in the domain of the model, '
                f'which is ({minn}, {maxx}).')

    def domain(self):
        pass

    @property
    def code(self) -> str:
        """ returns the final Stan code """
        return self._stan_code()

    def compile(self):
        self.model = StanModel(model_code=self.code)

    def sampling(self, N: int, q: Tuple[float, ...],
                 X: Tuple[float, ...]) -> 'StanFit4Model':
        self._check_domain(X)
        if self.model is None:
            self.compile()
        data_dict = {'N': N, 'M': len(q), 'q': q, 'X': X}
        samples = self.model.sampling(data=data_dict)
        return FitObjectSampling(self, samples)

    def optimizing(self, N: int, q: Tuple[float, ...],
                   X: Tuple[float, ...]) -> 'StanFit4Model':
        self._check_domain(X)
        if self.model is None:
            self.compile()
        data_dict = {'N': N, 'M': len(q), 'q': q, 'X': X}
        opt = self.model.optimizing(data=data_dict)
        return FitObjectOptimizing(self, opt)
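# Hypothetical usage sketch. `GaussianQM` and the `Normal`/`Gamma`
# Distribution wrappers are assumed to exist elsewhere in the package (the
# docstring above references them); the module path `bqme.models` and the
# constructor signatures shown here are illustrative and may differ from
# the real API.
from bqme.distributions import Normal, Gamma
from bqme.models import GaussianQM

qm = GaussianQM(Normal(0, 1, name='mu'), Gamma(1, 1, name='sigma'))
print(qm.code)  # inspect the generated Stan program before compiling

# Match M = 3 observed quantiles from N = 100 measurements:
# q holds the quantile levels, X the corresponding observed values.
qm_fit = qm.sampling(N=100, q=(0.25, 0.5, 0.75), X=(-0.2, 0.0, 0.3))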
def test_optimizing_basic():
    sm = StanModel(model_code='parameters {real y;} model {y ~ normal(0,1);}')
    op = sm.optimizing()
    assert op['par']['y'].shape == ()
    assert abs(op['par']['y']) < 1
# Get data + fit model
X_train, X_test, _, y_train, y_test, _, _, _, _, _ = data_pipeline(
    feature_engineering=True, feature_type='count')

stan_data = dict(y_train=y_train.squeeze(),
                 y_test=y_test.squeeze(),
                 p=X_train.shape[1] - 2,
                 N_train=X_train.shape[0],
                 N_test=X_test.shape[0],
                 X_train=X_train[:, 2:],
                 X_test=X_test[:, 2:],
                 loc_train=X_train[:, 0:2],
                 loc_test=X_test[:, 0:2],
                 max_D=max_D)

model_fit = car_model.optimizing(data=stan_data, iter=20000, seed=1)

pred_oos = model_fit['pred_test']
pred_is = model_fit['pred_train']
rmse_oos = np.sqrt(np.mean((pred_oos - y_test.squeeze())**2))
lik_oos = np.sum(model_fit['log_lik_test'])
rmse_is = np.sqrt(np.mean((pred_is - y_train.squeeze())**2))
lik_is = np.sum(model_fit['log_lik'])

row = pd.Series(
    {
        'RMSE_IS': rmse_is,
        'LIK_IS': lik_is,
        'RMSE_OOS': rmse_oos,
        'LIK_OOS': lik_oos
    },
    name='CAR model (count)')
results = results.append(row)