def test_scipy_optimizer(seed):
    """Check `scipy_optimize` against the weighted normal-equation solution.

    A Gaussian model with one plain covariate and one spline covariate is
    fit on seeded random data. The optimizer output must match the
    coefficients obtained by solving ``(X^T W X) beta = X^T W y`` directly,
    where ``X`` is ``model.mat[0]``, ``W`` the data weights and ``y`` the
    observations.
    """
    np.random.seed(seed)
    sample_size = 20
    # Columns are drawn in the order obs, cov0, cov1 so the random stream is
    # consumed exactly as the seed dictates.
    frame = pd.DataFrame({
        column: np.random.randn(sample_size)
        for column in ("obs", "cov0", "cov1")
    })
    data = Data(col_obs="obs", col_covs=["cov0", "cov1"], df=frame)

    specs = SplineSpecs(knots=np.linspace(0.0, 1.0, 5),
                        degree=3,
                        knots_type="rel_domain")
    linear_var = Variable(name="cov0")
    spline_var = SplineVariable(name="cov1", spline_specs=specs)
    model = GaussianModel(
        data,
        param_specs={"mu": {"variables": [linear_var, spline_var]}},
    )

    fitted = scipy_optimize(model)

    # Closed-form reference solution of the weighted least-squares problem.
    design = model.mat[0]
    weighted_design_t = design.T * model.data.weights
    expected = np.linalg.solve(weighted_design_t.dot(design),
                               weighted_design_t.dot(model.data.obs))
    assert np.allclose(fitted, expected)
def data():
    """Return a `Data` object over five rows of standard-normal draws.

    The frame has one observation column (``obs``) and two covariate
    columns (``cov0`` and ``cov1``).
    """
    num_rows = 5
    column_names = ("obs", "cov0", "cov1")
    frame = pd.DataFrame(
        {column: np.random.randn(num_rows) for column in column_names}
    )
    return Data(col_obs="obs", col_covs=["cov0", "cov1"], df=frame)
def __init__(self, data: Data, **kwargs):
    """Validate the binomial observations and cache the derived counts.

    Parameters
    ----------
    data : Data
        Data object with two observation columns: the first holds the
        number of events, the second the sample size.
    **kwargs
        Forwarded unchanged to the parent class initializer.

    Raises
    ------
    ValueError
        If any observation is negative, if the number of observation
        columns is not two, or if any number of events exceeds its
        sample size.
    """
    if not np.all(data.obs >= 0):
        raise ValueError(
            "Binomial model requires observations to be non-negative.")
    if len(data.col_obs) != 2:
        raise ValueError("Binomial model need 2 columns of observations, "
                         "one for number of events, one for sample size.")

    # Sample size minus number of events, one row per observation. Computed
    # once and reused for both the validation and the `obs_0s` attribute.
    num_non_events = np.diff(data.get_cols(data.col_obs), axis=1)
    # `np.any` reduces over the whole 2-D array; the builtin `any` would
    # iterate rows and depend on each row holding exactly one element.
    if np.any(num_non_events < 0):
        raise ValueError(
            "Binomial model requires number of events less or equal than sample size."
        )

    self.obs_1s = data.get_cols(data.col_obs[0])
    self.obs_0s = num_non_events.ravel()
    self.obs_sample_sizes = data.get_cols(data.col_obs[1])
    super().__init__(data, **kwargs)
def data():
    """Return a `Data` object with two observation columns for binomial tests.

    ``obs_1s`` is drawn uniformly from [0, 10) and ``obs_sample_sizes`` is
    ``obs_1s + 1.0``, so the second observation column always exceeds the
    first. Two standard-normal covariates are included.
    """
    num_rows = 5
    events = np.random.rand(num_rows) * 10
    columns = {
        "obs_1s": events,
        "obs_sample_sizes": events + 1.0,
    }
    # Covariates are drawn after the observations, cov0 before cov1.
    for cov_name in ("cov0", "cov1"):
        columns[cov_name] = np.random.randn(num_rows)
    return Data(col_obs=["obs_1s", "obs_sample_sizes"],
                col_covs=["cov0", "cov1"],
                df=pd.DataFrame(columns))
def get_mat(self, data: Data) -> np.ndarray:
    """Return the design matrix of this variable.

    The data is first validated with `check_data`; the matrix is then read
    from the data object through `get_covs` using this variable's name.

    Parameters
    ----------
    data : Data
        Data object that provides the covariates.

    Returns
    -------
    np.ndarray
        Design matrix.
    """
    self.check_data(data)
    design_mat = data.get_covs(self.name)
    return design_mat
def get_mat(self, data: Data) -> np.ndarray:
    """Return the spline design matrix of this variable.

    The data is first validated with `check_data`; the covariate column
    named after this variable is then passed to the spline's `design_mat`
    with both `l_extra` and `r_extra` enabled.

    Parameters
    ----------
    data : Data
        Data object that provides the covariates.

    Returns
    -------
    np.ndarray
        Design matrix.
    """
    self.check_data(data)
    covariate = data.get_cols(self.name)
    extrapolation = {"l_extra": True, "r_extra": True}
    return self.spline.design_mat(covariate, **extrapolation)
def check_data(self, data: Data):
    """Validate the data, build the spline if needed, and wire up priors.

    Runs the parent class check first. If no spline has been created yet,
    one is built from `spline_specs` using the covariate column named
    after this variable. The spline is then attached to every
    `SplinePrior` found among the linear uniform and Gaussian priors.

    Parameters
    ----------
    data : Data
        Data object to be checked.
    """
    super().check_data(data)

    if self.spline is None:
        covariate = data.get_cols(self.name)
        self.spline = self.spline_specs.create_spline(covariate)

    linear_priors = self.linear_upriors + self.linear_gpriors
    spline_priors = [
        prior for prior in linear_priors if isinstance(prior, SplinePrior)
    ]
    for spline_prior in spline_priors:
        spline_prior.attach_spline(self.spline)
def test_no_match_col_obs(df):
    """Observations are all NaN when the observation column is absent."""
    df_without_obs = df.drop(COL_OBS, axis=1)
    data = Data(col_obs=COL_OBS, df=df_without_obs)
    nan_flags = np.isnan(data.obs)
    assert all(nan_flags)
def test_mult_obs(df):
    """Two observation columns yield a two-column observation array."""
    obs_columns = [COL_OBS, COL_COVS[0]]
    data = Data(col_obs=obs_columns, df=df)
    assert len(data.col_obs) == 2
    assert data.obs.shape == (data.num_obs, 2)
def test_no_obs(df):
    """Reading `obs` raises ValueError when no observation column is given."""
    data_without_obs = Data(col_covs=COL_COVS, df=df)
    with pytest.raises(ValueError):
        _ = data_without_obs.obs
def test_attach_df(df):
    """A `Data` built without a frame is empty until `attach_df` is called."""
    column_args = (COL_OBS, COL_COVS, COL_WEIGHTS, COL_OFFSET)
    data = Data(*column_args)
    assert data.is_empty()

    data.attach_df(df)
    assert data.num_obs == NUM_OBS
def test_post_init_fill_df(df_simple):
    """Check the values `Data` exposes when only the obs column is named.

    Weights must all be 1.0, offsets all 0.0, and the ``intercept`` column
    all 1.0.
    """
    data = Data(COL_OBS, df=df_simple)
    assert data.num_obs == NUM_OBS

    weights_are_one = data.weights == 1.0
    assert all(weights_are_one)

    offsets_are_zero = data.offset == 0.0
    assert all(offsets_are_zero)

    intercept_is_one = data.get_cols('intercept') == 1.0
    assert all(intercept_is_one)
def test_post_init_empty():
    """A `Data` created without a data frame reports itself as empty."""
    assert Data(COL_OBS).is_empty()
def test_init(df):
    """A fully specified `Data` reports the expected number of observations."""
    # The fourth positional argument is COL_OFFSET, matching the call in
    # `test_attach_df`; the earlier code passed COL_WEIGHTS a second time.
    data = Data(COL_OBS, COL_COVS, COL_WEIGHTS, COL_OFFSET, df)
    assert data.num_obs == NUM_OBS
def data(df):
    """Return a `Data` object built from `df` with all column roles named."""
    # The fourth positional argument is COL_OFFSET, matching the call in
    # `test_attach_df`; the earlier code passed COL_WEIGHTS a second time.
    return Data(COL_OBS, COL_COVS, COL_WEIGHTS, COL_OFFSET, df)
def _build_location_structure(df_national, df_subnational):
    """Map super region -> region -> national id -> list of subnational ids.

    Super regions, regions and national ids are read from `df_national`.
    The subnational ids of a national location are the unique
    `ihme_loc_id` values in `df_subnational` that start with the national
    id.
    """
    structure = {}
    for super_region in df_national.super_region_name.unique():
        in_super_region = df_national.super_region_name == super_region
        structure[super_region] = {}
        for region in df_national[in_super_region].region_name.unique():
            in_region = df_national.region_name == region
            nationals = list(df_national[in_region].ihme_loc_id.unique())
            structure[super_region][region] = {}
            for national in nationals:
                is_child = df_subnational.ihme_loc_id.str.startswith(national)
                structure[super_region][region][national] = list(
                    df_subnational.ihme_loc_id[is_child].unique())
    return structure


def main():
    """Fit the global IDR model, then build, fit and plot the cascade.

    Steps:

    1. Read the CSV at `data_path`, keep rows flagged by `include`, and
       split them into national rows (`ihme_loc_id` of length 3) and
       subnational rows (everything else).
    2. Fit the IDR model on the national rows and pin the IDR variable's
       coefficients to the fitted values with a uniform prior whose lower
       and upper bounds coincide.
    3. Build one `Cascade` per location at every level (global, super
       region, region, national, subnational), link them, and run them.
    4. Predict for every cascade model and save one PDF per location under
       `results_path`.

    Returns
    -------
    list
        The cascade models as returned by `global_model.to_list()`.
    """
    # load data
    df_all = pd.read_csv(data_path)
    df_all = df_all[df_all.include].reset_index(drop=True)
    national_index = df_all.ihme_loc_id.str.len() == 3
    df_national = df_all[national_index].reset_index(drop=True)
    df_subnational = df_all[~national_index].reset_index(drop=True)
    df = df_national

    # create results folder; a single call avoids the race between an
    # existence check and the creation, and also creates missing parents
    results_path.mkdir(parents=True, exist_ok=True)

    # Fit global IDR model
    idr_model = ExcessMortalityModel(df, [idr_model_variables],
                                     col_obs="logit_ratio")
    idr_model.run_models()

    # attach data to create spline
    data = Data(col_obs="logit_ratio",
                col_covs=[
                    intercept_variable.name,
                    idr_variable.name,
                    time_variable.name,
                ])
    data.df = df
    idr_variable.check_data(data)
    time_variable.check_data(data)

    # fix idr coefficients: identical lower and upper bounds pin the values
    coefs = idr_model.results[0]["coefs"][1:]
    idr_variable.add_priors(UniformPrior(lb=coefs, ub=coefs))

    # getting location structure
    location_structure = _build_location_structure(df_national,
                                                   df_subnational)

    # construct cascade model
    # global model
    global_model = Cascade(df, cascade_specs, level_id=0, name="Global")

    # super region model
    super_region_models = [
        Cascade(
            df[df.super_region_name == super_region].reset_index(drop=True),
            cascade_specs,
            level_id=1,
            name=super_region)
        for super_region in df.super_region_name.unique()
    ]

    # region model
    region_models = [
        Cascade(df[df.region_name == region].reset_index(drop=True),
                cascade_specs,
                level_id=2,
                name=region)
        for region in df.region_name.unique()
    ]

    # national model
    national_models = [
        Cascade(df_national[df_national.ihme_loc_id == national].reset_index(
            drop=True),
                cascade_specs,
                level_id=3,
                name=national)
        for national in df_national.ihme_loc_id.unique()
    ]

    # subnational model
    subnational_models = [
        Cascade(df_subnational[df_subnational.ihme_loc_id ==
                               subnational].reset_index(drop=True),
                cascade_specs,
                level_id=4,
                name=subnational)
        for subnational in df_subnational.ihme_loc_id.unique()
    ]

    # link all models together
    link_cascade_models(global_model, [
        super_region_models, region_models, national_models,
        subnational_models
    ], location_structure)

    # fit model
    global_model.run_models()

    # create plots
    model_list = global_model.to_list()

    # predict
    pred_dfs = {
        cmodel.name: predict(cmodel.df, cmodel.model)
        for cmodel in model_list
    }

    # plot
    for loc_id in df_all.ihme_loc_id.unique():
        df_sub = df_all[df_all.ihme_loc_id == loc_id]
        loc_structure = [
            "Global", df_sub.super_region_name.values[0],
            df_sub.region_name.values[0]
        ]
        # ids longer than three characters are subnational: list the parent
        # national id (the first three characters) before the id itself
        if len(loc_id) > 3:
            loc_structure.extend([loc_id[:3], loc_id])
        else:
            loc_structure.append(loc_id)
        plot_model(df_all, pred_dfs, loc_structure)
        plt.savefig(results_path / f"{loc_id}.pdf", bbox_inches="tight")
        plt.close("all")

    # # create results dataframe
    # coefs = pd.concat([model.model.get_coefs_df() for model in model_list])
    # coefs["location"] = [model.name for model in model_list]
    # coefs.to_csv(results_path / "coefs.csv", index=False)

    # # create samples of the coefficient
    # for cmodel in model_list:
    #     df_coefs = sample_coefs(cmodel)
    #     df_coefs.to_csv(results_path / f"cdraws_{cmodel.level_id}_{cmodel.name}.csv", index=False)

    return model_list