def test_intercept_only_model(crossed_data):
    """Intercept-only model built via ``fit`` and via ``add`` has the same fixed terms."""
    # Route 1: hand the whole formula to fit() without sampling, then build and sample.
    fit_model = Model(crossed_data)
    fit_model.fit("Y ~ 1", run=False)
    fit_model.build(backend="pymc3")
    fit_model.fit(samples=1)

    # Route 2: add the response first, then the intercept, then build and sample.
    add_model = Model(crossed_data)
    add_model.add("Y ~ 0")
    add_model.add("1")
    add_model.build(backend="pymc3")
    add_model.fit(samples=1)

    # Collect prior arguments of every non-random term, keyed by term name.
    fit_priors = {}
    for term in fit_model.terms.values():
        if not term.random:
            fit_priors[term.name] = term.prior.args
    add_priors = {}
    for term in add_model.terms.values():
        if not term.random:
            add_priors[term.name] = term.prior.args

    # set() over a dict yields its keys, so this compares the term names.
    assert set(fit_priors) == set(add_priors)
def test_distribute_group_specific_effect_over(diabetes_data):
    """Check names, coords and data shape of a categorical slope distributed over BMI."""
    term_name = "C(age_grp)|BMI"

    # 163 unique levels of BMI in diabetes_data.
    # With intercept: treatment encoding, so the first sorted level is dropped.
    with_intercept = Model("BP ~ (C(age_grp)|BMI)", diabetes_data)
    with_intercept.build()
    expected_levels = sorted(diabetes_data["age_grp"].unique())[1:]

    assert term_name in with_intercept.terms
    assert "1|BMI" in with_intercept.terms
    coords = with_intercept.terms[term_name].pymc_coords
    assert coords["C(age_grp)_coord_group_expr"] == expected_levels

    # This is the sub-matrix of Z that corresponds to this term:
    # 442 observations, 163 groups, and 2 remaining levels of 'C(age_grp)'
    # once the reference level is removed -> 163 * 2 = 326 columns.
    assert with_intercept.terms[term_name].data.shape == (442, 326)

    # Without intercept the reference level is kept -> 163 * 3 = 489 columns.
    without_intercept = Model("BP ~ (0 + C(age_grp)|BMI)", diabetes_data)
    without_intercept.build()

    assert term_name in without_intercept.terms
    assert "1|BMI" not in without_intercept.terms
    assert without_intercept.terms[term_name].data.shape == (442, 489)
def test_cell_means_with_covariate(crossed_data):
    """Compare a formula-built and an ``add_term``-built cell-means model with a covariate."""

    def fixed_columns(model):
        # One tuple per design-matrix column, pooled over all fixed terms, so the
        # comparison ignores term names, level names and column order.
        columns = set()
        for term in model.fixed_terms.values():
            for idx in range(len(term.levels)):
                columns.add(tuple(term.data[:, idx]))
        return columns

    def fixed_priors(model):
        # Prior arguments of every non-random term, keyed by term name.
        return {term.name: term.prior.args for term in model.terms.values() if not term.random}

    # Model specified through a formula.
    formula_model = Model(crossed_data)
    formula_model.fit("Y ~ 0 + threecats + continuous", run=False)
    formula_model.build()
    formula_model.fit(samples=1)

    # Model specified term by term.
    manual_model = Model(crossed_data)
    manual_model.add_y("Y")
    manual_model.add_term("threecats", drop_first=False)
    manual_model.add_term("continuous")
    manual_model.build()
    manual_model.fit(samples=1)

    # Design matrices must hold the same columns.
    formula_columns = fixed_columns(formula_model)
    manual_columns = fixed_columns(manual_model)
    assert formula_columns == manual_columns

    # The threecats prior must have finite variance.
    threecats_sd = formula_model.terms["threecats"].prior.args["sd"]
    assert not any(np.isinf(threecats_sd))

    # Both models must expose the same fixed-effect term names (dict keys).
    formula_priors = fixed_priors(formula_model)
    manual_priors = fixed_priors(manual_model)
    assert set(formula_priors) == set(manual_priors)
def test_many_fixed_effects(crossed_data):
    """Several fixed effects: formula and ``add_term`` construction must agree."""

    def fixed_columns(model):
        # Pool every fixed-term column as a tuple; names and ordering are ignored.
        columns = set()
        for term in model.fixed_terms.values():
            for idx in range(len(term.levels)):
                columns.add(tuple(term.data[:, idx]))
        return columns

    def fixed_priors(model):
        # Prior arguments of every non-random term, keyed by term name.
        return {term.name: term.prior.args for term in model.terms.values() if not term.random}

    # Model specified through a formula.
    formula_model = Model(crossed_data)
    formula_model.fit("Y ~ continuous + dummy + threecats", run=False)
    formula_model.build()
    formula_model.fit(samples=1)

    # Model specified term by term.
    manual_model = Model(crossed_data)
    manual_model.add_y("Y")
    manual_model.add_intercept()
    for predictor in ("continuous", "dummy", "threecats"):
        manual_model.add_term(predictor)
    manual_model.build()
    manual_model.fit(samples=1)

    # Term names must agree.
    assert set(formula_model.term_names) == set(manual_model.term_names)

    # Design matrices must hold the same columns.
    formula_columns = fixed_columns(formula_model)
    manual_columns = fixed_columns(manual_model)
    assert formula_columns == manual_columns

    # Both models must expose the same fixed-effect term names (dict keys).
    formula_priors = fixed_priors(formula_model)
    manual_priors = fixed_priors(manual_model)
    assert set(formula_priors) == set(manual_priors)
def test_cell_means_with_random_intercepts(crossed_data):
    """Cell-means model with random intercepts: formula vs ``add_term`` construction."""

    def fixed_columns(model):
        # Pool every fixed-term column as a tuple; names and ordering are ignored.
        columns = set()
        for term in model.fixed_terms.values():
            for idx in range(len(term.levels)):
                columns.add(tuple(term.data[:, idx]))
        return columns

    # Model specified through a formula plus a random-effects list.
    formula_model = Model(crossed_data)
    formula_model.fit("Y ~ 0 + threecats", random=["subj"], run=False)
    formula_model.build()
    formula_model.fit(samples=1)

    # Model specified term by term, with the intercept switched off.
    manual_model = Model(crossed_data, intercept=False)
    manual_model.add_y("Y")
    manual_model.add_term("threecats", categorical=True, drop_first=False)
    manual_model.add_term("subj", categorical=True, random=True, drop_first=False)
    manual_model.build()
    manual_model.fit(samples=1)

    # Same random terms.
    assert set(formula_model.random_terms) == set(manual_model.random_terms)

    # Same fixed-effect design-matrix columns.
    formula_columns = fixed_columns(formula_model)
    manual_columns = fixed_columns(manual_model)
    assert formula_columns == manual_columns

    # Same fixed-effect term names (set() over the dicts compares the keys).
    formula_fixed = {
        term.name: term.prior.args for term in formula_model.terms.values() if not term.random
    }
    manual_fixed = {
        term.name: term.prior.args for term in manual_model.terms.values() if not term.random
    }
    assert set(formula_fixed) == set(manual_fixed)

    # Same random-effect term names; the 'sd' hyperprior args are read for each term.
    formula_random = {
        term.name: term.prior.args["sd"].args
        for term in formula_model.terms.values()
        if term.random
    }
    manual_random = {
        term.name: term.prior.args["sd"].args
        for term in manual_model.terms.values()
        if term.random
    }
    assert set(formula_random) == set(manual_random)
def test_one_shot_formula_fit(diabetes_data):
    """A formula passed to ``fit`` ends up as named variables in the backend model."""
    model = Model(diabetes_data)
    model.fit("S3 ~ S1 + S2", samples=50, run=False)
    model.build(backend="pymc3")

    named_vars = model.backend.model.named_vars
    expected = ["S3", "S1", "Intercept"]

    # All three expected names must be present among the backend's variables.
    found = set(named_vars.keys()) & set(expected)
    assert len(found) == 3
def test_cell_means_parameterization(crossed_data):
    """Cell-means model: ``fit`` and ``add`` construction give matching fixed effects."""

    def fixed_columns(model):
        # Pool every fixed-term column as a tuple; names and ordering are ignored.
        columns = set()
        for term in model.fixed_terms.values():
            for idx in range(len(term.levels)):
                columns.add(tuple(term.data[:, idx]))
        return columns

    def fixed_priors(model):
        # Prior arguments of every non-random term, keyed by term name.
        return {term.name: term.prior.args for term in model.terms.values() if not term.random}

    # Build through fit().
    fit_model = Model(crossed_data)
    fit_model.fit("Y ~ 0 + threecats", run=False)
    fit_model.build(backend="pymc3")
    fit_model.fit(tune=0, samples=1, init=None)

    # Build through add().
    add_model = Model(crossed_data)
    add_model.add("Y ~ 0")
    add_model.add("0 + threecats")
    add_model.build(backend="pymc3")
    add_model.fit(tune=0, samples=1)

    # Design matrices must hold the same columns.
    fit_columns = fixed_columns(fit_model)
    add_columns = fixed_columns(add_model)
    assert fit_columns == add_columns

    # Both models must expose the same fixed-effect term names (dict keys).
    fit_priors = fixed_priors(fit_model)
    add_priors = fixed_priors(add_model)
    assert set(fit_priors) == set(add_priors)
def test_one_shot_formula_fit(diabetes_data):
    """The response, a predictor and the intercept appear as backend named variables."""
    model = Model(diabetes_data)
    model.fit("S3 ~ S1 + S2", samples=50, run=False)
    model.build(backend="pymc3")

    backend_names = set(model.backend.model.named_vars.keys())
    wanted = set(["S3", "S1", "Intercept"])

    # The intersection must contain every wanted name.
    assert len(backend_names & wanted) == 3
def test_model_term_names_property(diabetes_data):
    """``term_names`` lists the intercept followed by terms in the order they were added."""
    model = Model(diabetes_data)
    # The first add() carries the response; the later ones append single predictors.
    for spec in ("BMI ~ age_grp", "BP", "S1"):
        model.add(spec)
    model.build(backend="pymc")

    expected_names = ["Intercept", "age_grp", "BP", "S1"]
    assert model.term_names == expected_names
def test_plot_priors(crossed_data):
    """``plot_priors`` raises ``ValueError`` before ``build`` and runs after it."""
    cell_means = Model("Y ~ 0 + threecats", crossed_data)

    # Priors cannot be plotted until the model is built.
    with pytest.raises(ValueError):
        cell_means.plot_priors()

    cell_means.build()
    cell_means.plot_priors()
def test_model_graph(crossed_data):
    """``graph`` raises ``ValueError`` before ``build`` and runs after it."""
    cell_means = Model("Y ~ 0 + threecats", crossed_data)

    # The graph cannot be plotted until the model is built.
    with pytest.raises(ValueError):
        cell_means.graph()

    cell_means.build()
    cell_means.graph()
def test_auto_scale(diabetes_data):
    """Check priors on S1, S2 and BP with ``auto_scale`` on (default) and off."""
    formula = "BMI ~ S1 + S2 + BP"
    priors = {"S1": 0.3, "BP": Prior("Cauchy", alpha=1, beta=17.5)}

    # By default, everything except custom Prior() objects should be scaled.
    scaled = Model(diabetes_data)
    scaled.fit(formula, run=False, priors=priors)
    scaled.build(backend="pymc3")
    s1_prior = scaled.terms["S1"].prior
    s2_prior = scaled.terms["S2"].prior
    bp_prior = scaled.terms["BP"].prior

    assert s1_prior.name == s2_prior.name
    assert s2_prior.name == "Normal"
    assert 0 < s1_prior.args["sd"] < 1
    assert s2_prior.args["sd"] > s1_prior.args["sd"]
    assert bp_prior.name == "Cauchy"
    assert bp_prior.args["beta"] == 17.5

    # With auto_scale off, everything should be flat unless explicitly named in priors.
    unscaled = Model(diabetes_data, auto_scale=False)
    unscaled.fit(formula, run=False, priors=priors)
    unscaled.build(backend="pymc3")
    s1_prior_off = unscaled.terms["S1"].prior
    s2_prior_off = unscaled.terms["S2"].prior
    bp_prior_off = unscaled.terms["BP"].prior

    assert s1_prior_off.name == "Normal"
    assert s2_prior_off.name == "Flat"
    assert 0 < s1_prior_off.args["sd"] < 1
    assert "sd" not in s2_prior_off.args
    assert bp_prior_off.name == "Cauchy"
    assert bp_prior_off.args["beta"] == 17.5
def test_slope_only_model(crossed_data):
    """Slope-only model: ``fit`` and ``add`` construction give the same terms."""
    # Build through fit().
    fit_model = Model(crossed_data)
    fit_model.fit("Y ~ 0 + continuous", run=False)
    fit_model.build(backend="pymc3")
    fit_model.fit(tune=0, samples=1, init=None)

    # Build through add().
    add_model = Model(crossed_data)
    add_model.add("Y ~ 0")
    add_model.add("0 + continuous")
    add_model.build(backend="pymc3")
    add_model.fit(tune=0, samples=1)

    # Term names must agree.
    assert set(fit_model.term_names) == set(add_model.term_names)

    # Collect prior arguments of every non-random term, keyed by term name.
    fit_priors = {}
    for term in fit_model.terms.values():
        if not term.random:
            fit_priors[term.name] = term.prior.args
    add_priors = {}
    for term in add_model.terms.values():
        if not term.random:
            add_priors[term.name] = term.prior.args

    # set() over a dict yields its keys, so this compares the term names.
    assert set(fit_priors) == set(add_priors)
def test_model_term_names_property(diabetes_data):
    """After three ``add`` calls, ``term_names`` is the intercept plus the added terms."""
    model = Model(diabetes_data)
    additions = ["BMI ~ age_grp", "BP", "S1"]
    for formula_part in additions:
        model.add(formula_part)
    model.build(backend="pymc")

    # Order matters: the comparison is against a list.
    assert model.term_names == ["Intercept", "age_grp", "BP", "S1"]
def test_auto_scale(diabetes_data):
    """Exercise prior selection for S1, S2 and BP with and without ``auto_scale``."""
    formula = "BMI ~ S1 + S2 + BP"
    user_priors = {"S1": 0.3, "BP": Prior("Cauchy", alpha=1, beta=17.5)}

    # By default, everything except custom Prior() objects should be scaled.
    model_on = Model(diabetes_data)
    model_on.fit(formula, run=False, priors=user_priors)
    model_on.build(backend="pymc3")
    on_s1 = model_on.terms["S1"].prior
    on_s2 = model_on.terms["S2"].prior
    on_bp = model_on.terms["BP"].prior

    assert on_s1.name == on_s2.name
    assert on_s2.name == "Normal"
    assert 0 < on_s1.args["sd"] < 1
    assert on_s2.args["sd"] > on_s1.args["sd"]
    assert on_bp.name == "Cauchy"
    assert on_bp.args["beta"] == 17.5

    # With auto_scale off, everything should be flat unless explicitly named in priors.
    model_off = Model(diabetes_data, auto_scale=False)
    model_off.fit(formula, run=False, priors=user_priors)
    model_off.build(backend="pymc3")
    off_s1 = model_off.terms["S1"].prior
    off_s2 = model_off.terms["S2"].prior
    off_bp = model_off.terms["BP"].prior

    assert off_s1.name == "Normal"
    assert off_s2.name == "Flat"
    assert 0 < off_s1.args["sd"] < 1
    assert "sd" not in off_s2.args
    assert off_bp.name == "Cauchy"
    assert off_bp.args["beta"] == 17.5
def test_cell_means_with_covariate(crossed_data):
    """Cell means plus a covariate: ``fit`` and ``add`` construction must agree."""

    def fixed_columns(model):
        # Pool every fixed-term column as a tuple; names and ordering are ignored.
        columns = set()
        for term in model.fixed_terms.values():
            for idx in range(len(term.levels)):
                columns.add(tuple(term.data[:, idx]))
        return columns

    def fixed_priors(model):
        # Prior arguments of every non-random term, keyed by term name.
        return {term.name: term.prior.args for term in model.terms.values() if not term.random}

    # Build through fit(); the sampling call is disabled in this test.
    fit_model = Model(crossed_data)
    fit_model.fit("Y ~ 0 + threecats + continuous", run=False)
    fit_model.build(backend="pymc3")
    # fit_model.fit(tune=0, samples=1)

    # Build through add(); the sampling call is disabled in this test.
    add_model = Model(crossed_data)
    for piece in ("Y ~ 0", "0 + threecats", "0 + continuous"):
        add_model.add(piece)
    add_model.build(backend="pymc3")
    # add_model.fit(tune=0, samples=1)

    # Design matrices must hold the same columns.
    fit_columns = fixed_columns(fit_model)
    add_columns = fixed_columns(add_model)
    assert fit_columns == add_columns

    # The threecats prior must have finite variance.
    threecats_sd = fit_model.terms["threecats"].prior.args["sd"]
    assert not any(np.isinf(threecats_sd))

    # Both models must expose the same fixed-effect term names (dict keys).
    fit_priors = fixed_priors(fit_model)
    add_priors = fixed_priors(add_model)
    assert set(fit_priors) == set(add_priors)
def test_model_term_names_property_interaction(crossed_data):
    """``threecats*fourcats`` expands to the intercept, both main effects and the interaction."""
    # Ten copies of each of four levels, the whole 40-item pattern repeated three times.
    # The column is added to the fixture frame in place.
    one_cycle = [level for level in ["a", "b", "c", "d"] for _ in range(10)]
    crossed_data["fourcats"] = one_cycle * 3

    model = Model("Y ~ threecats*fourcats", crossed_data)
    model.build()

    expected_names = ["Intercept", "threecats", "fourcats", "threecats:fourcats"]
    assert model.term_names == expected_names
def test_many_random_effects(crossed_data):
    """Compare a formula-built model with an ``add_term``-built one with many random effects.

    Checks that both models have the same random-effect design-matrix columns,
    the same fixed-effect design-matrix columns, and the same fixed and random
    term names.

    The random-effect design matrix of the second model used to be assembled
    from ``model0`` as well, which made that comparison trivially true; it is
    now assembled from ``model1``.
    """

    def random_design(model):
        # Concatenate the data of all random terms column-wise. A term whose data is a
        # dict contributes the column-wise concatenation of its values.
        frames = []
        for term in model.random_terms.values():
            if isinstance(term.data, dict):
                parts = [pd.DataFrame(term.data[key]) for key in term.data.keys()]
                frames.append(pd.concat(parts, axis=1))
            else:
                frames.append(pd.DataFrame(term.data))
        return pd.concat(frames, axis=1)

    def column_set(frame):
        # Compare columns as tuples so names and ordering do not matter.
        return set(tuple(frame.iloc[:, i]) for i in range(len(frame.columns)))

    def fixed_columns(model):
        return set(
            tuple(term.data[:, lev])
            for term in model.fixed_terms.values()
            for lev in range(len(term.levels))
        )

    # build model using formula
    model0 = Model(crossed_data)
    model0.fit(
        'Y ~ continuous',
        random=['0+threecats|subj', 'continuous|item', 'dummy|item', 'threecats|site'],
        run=False,
    )
    model0.build()
    # model0.fit(samples=1)

    # build model using add_term
    model1 = Model(crossed_data)
    model1.add_y('Y')
    # fixed effects
    model1.add_intercept()
    model1.add_term('continuous')
    # random effects
    model1.add_term('threecats', over='subj', drop_first=False, random=True, categorical=True)
    model1.add_term('item', random=True, categorical=True)
    model1.add_term('continuous', over='item', random=True)
    model1.add_term('dummy', over='item', random=True)
    model1.add_term('site', random=True, categorical=True)
    model1.add_term('threecats', over='site', random=True, categorical=True)
    model1.build()
    # model1.fit(samples=1)

    # check that the random effects design matrices have the same shape
    Z0 = random_design(model0)
    Z1 = random_design(model1)  # fixed: previously built from model0
    assert Z0.shape == Z1.shape

    # check that the random effect design matrices contain the same columns,
    # even if term names / column names / order of columns is different
    assert column_set(Z0) == column_set(Z1)

    # check that fixed effect design matrices are the same,
    # even if term names / level names / order of columns is different
    assert fixed_columns(model0) == fixed_columns(model1)

    # check that add_formula and add_term models have same priors for fixed effects
    priors0 = {x.name: x.prior.args for x in model0.terms.values() if not x.random}
    priors1 = {x.name: x.prior.args for x in model1.terms.values() if not x.random}
    assert set(priors0) == set(priors1)

    # check that add_formula and add_term models have same priors for random effects
    priors0 = {x.name: x.prior.args['sd'].args for x in model0.terms.values() if x.random}
    priors1 = {x.name: x.prior.args['sd'].args for x in model1.terms.values() if x.random}
    assert set(priors0) == set(priors1)
def test_add_formula_append(diabetes_data):
    """``add`` appends by default; ``append=False`` clears the response and earlier terms."""
    model = Model(diabetes_data)
    model.add("S3 ~ 0")
    model.add("S1")
    model.build(backend="pymc")

    # The response is set and the appended predictor is present.
    assert hasattr(model, "y")
    assert model.y is not None
    assert model.y.name == "S3"
    assert "S1" in model.terms

    # Adding without appending leaves the model with no response.
    model.add("S2", append=False)
    assert model.y is None

    # After restoring the response, only the new predictor remains.
    model.add("S3 ~ 0")
    model.build(backend="pymc")
    assert "S2" in model.terms
    assert "S1" not in model.terms
def test_add_formula_append(diabetes_data):
    """Check ``add`` with the default append mode and with ``append=False``."""
    model = Model(diabetes_data)
    for piece in ("S3 ~ 0", "S1"):
        model.add(piece)
    model.build(backend="pymc")

    # Response and first predictor are both registered.
    response_is_s3 = hasattr(model, "y") and model.y is not None and model.y.name == "S3"
    assert response_is_s3
    assert "S1" in model.terms

    # append=False: the response is gone afterwards.
    model.add("S2", append=False)
    assert model.y is None

    # Re-add the response and rebuild; S2 is kept, S1 is not.
    model.add("S3 ~ 0")
    model.build(backend="pymc")
    terms_after = model.terms
    assert "S2" in terms_after
    assert "S1" not in terms_after
def test_distribute_random_effect_over(diabetes_data):
    """A categorical random effect over BMI yields a (442, 163) matrix per level term."""
    expected_shape = (442, 163)
    model = Model(diabetes_data)

    # Random slopes
    model.add("BP ~ 1")
    model.add(random="C(age_grp)|BMI")
    model.build(backend="pymc")
    slope_term = model.terms["C(age_grp)[T.1]|BMI"]
    assert slope_term.data.shape == expected_shape

    # Nested or crossed random intercepts, on the same model after a reset
    model.reset()
    model.add("BP ~ 1")
    model.add(random="0+C(age_grp)|BMI")
    model.build(backend="pymc")
    intercept_term = model.terms["C(age_grp)[0]|BMI"]
    assert intercept_term.data.shape == expected_shape
def test_model_terms_cleaned_levels_interaction(crossed_data):
    """The interaction term's ``cleaned_levels`` list the six non-reference level pairs."""
    # Ten copies of each of four levels, the whole 40-item pattern repeated three times.
    # The column is added to the fixture frame in place.
    one_cycle = [level for level in ["a", "b", "c", "d"] for _ in range(10)]
    crossed_data["fourcats"] = one_cycle * 3

    model = Model("Y ~ threecats*fourcats", crossed_data)
    model.build()

    expected_levels = [
        "threecats[b]:fourcats[b]",
        "threecats[b]:fourcats[c]",
        "threecats[b]:fourcats[d]",
        "threecats[c]:fourcats[b]",
        "threecats[c]:fourcats[c]",
        "threecats[c]:fourcats[d]",
    ]
    interaction = model.terms["threecats:fourcats"]
    assert interaction.cleaned_levels == expected_levels
def test_empty_model(crossed_data):
    """A model with no predictors can be built via ``add`` or ``fit`` with the same result."""
    # Response only, specified through add().
    add_model = Model(crossed_data)
    add_model.add("Y ~ 0")
    add_model.build(backend="pymc3")
    add_model.fit(tune=0, samples=1)

    # Response only, specified through fit().
    fit_model = Model(crossed_data)
    fit_model.fit("Y ~ 0", run=False)
    fit_model.build(backend="pymc3")
    fit_model.fit(tune=0, samples=1)

    # Collect prior arguments of every non-random term, keyed by term name.
    add_priors = {}
    for term in add_model.terms.values():
        if not term.random:
            add_priors[term.name] = term.prior.args
    fit_priors = {}
    for term in fit_model.terms.values():
        if not term.random:
            fit_priors[term.name] = term.prior.args

    # set() over a dict yields its keys, so this compares the term names.
    assert set(add_priors) == set(fit_priors)
def test_categorical_term():
    """Each term's ``categorical`` flag matches the expected value for a small mixed model."""
    n_obs = 6
    frame = pd.DataFrame(
        {
            "y": np.random.normal(size=n_obs),
            "x1": np.random.normal(size=n_obs),
            "x2": [1, 1, 0, 0, 1, 1],
            "g1": ["a"] * 3 + ["b"] * 3,
            "g2": ["x", "x", "z", "z", "y", "y"],
        }
    )
    model = Model("y ~ x1 + x2 + g1 + (g1|g2) + (x2|g2)", frame)
    model.build()

    # Term name -> expected value of its `categorical` attribute.
    expected_flags = {
        "x1": False,
        "x2": False,
        "g1": True,
        "1|g2": False,
        "g1[b]|g2": True,
        "x2|g2": False,
    }
    for name, flag in expected_flags.items():
        assert model.terms[name].categorical is flag
def test_derived_term_search(diabetes_data):
    """``_match_derived_terms`` returns matching terms, or ``None`` when nothing matches."""
    model = Model(diabetes_data)
    model.add("BMI ~ 1", random="age_grp|BP", categorical=["age_grp"])
    model.build(backend="pymc")

    # The slope spec matches the group intercept and both level-specific slopes.
    matched = model._match_derived_terms("age_grp|BP")
    matched_names = {term.name for term in matched}
    assert matched_names == {"1|BP", "age_grp[T.1]|BP", "age_grp[T.2]|BP"}

    # The intercept spec matches the intercept term first.
    first_match = model._match_derived_terms("1|BP")[0]
    assert first_match.name == "1|BP"

    # All of these should find nothing
    assert model._match_derived_terms("1|ZZZ") is None
    assert model._match_derived_terms("ZZZ|BP") is None
    assert model._match_derived_terms("BP") is None
    # Same lookup repeated; the result must still be None.
    assert model._match_derived_terms("BP") is None
def test_derived_term_search(diabetes_data):
    """Look up derived random-effect terms by spec and check the hits and the misses."""
    model = Model(diabetes_data)
    model.add("BMI ~ 1", random="age_grp|BP", categorical=["age_grp"])
    model.build(backend="pymc")

    # Searching for the slope spec returns three terms.
    hits = model._match_derived_terms("age_grp|BP")
    hit_names = set()
    for hit in hits:
        hit_names.add(hit.name)
    assert hit_names == {"1|BP", "age_grp[T.1]|BP", "age_grp[T.2]|BP"}

    # Searching for the intercept spec returns the intercept term first.
    intercept_hit = model._match_derived_terms("1|BP")[0]
    assert intercept_hit.name == "1|BP"

    # All of these should find nothing
    assert model._match_derived_terms("1|ZZZ") is None
    assert model._match_derived_terms("ZZZ|BP") is None
    assert model._match_derived_terms("BP") is None
    # Same lookup repeated; the result must still be None.
    assert model._match_derived_terms("BP") is None
def test_3x4_fixed_anova(crossed_data):
    """Fit a 3x4 fixed-effects ANOVA with and without intercept and count posterior variables."""
    # Add a four-level category that's perfectly crossed with threecats: ten copies of
    # each level, the 40-item pattern repeated three times. The fixture is modified in place.
    one_cycle = [level for level in ["a", "b", "c", "d"] for _ in range(10)]
    crossed_data["fourcats"] = one_cycle * 3

    # With intercept.
    with_intercept = Model(crossed_data)
    with_intercept.fit("Y ~ threecats*fourcats", run=False)
    with_intercept.build(backend="pymc3")
    result_with = with_intercept.fit(tune=0, samples=1, init=None)
    assert len(result_with.posterior.data_vars) == 5

    # Without intercept (i.e., 2-factor cell means model).
    cell_means = Model(crossed_data)
    cell_means.fit("Y ~ 0 + threecats*fourcats", run=False)
    cell_means.build(backend="pymc3")
    result_without = cell_means.fit(tune=0, samples=1)
    assert len(result_without.posterior.data_vars) == 4
def test_empty_model(crossed_data):
    """A response-only model built via ``add_y`` matches one built via the ``'Y ~ 0'`` formula."""
    # Response registered directly with add_y().
    direct_model = Model(crossed_data)
    direct_model.add_y("Y")
    direct_model.build()
    direct_model.fit(samples=1)

    # Response supplied through a formula with no predictors.
    formula_model = Model(crossed_data)
    formula_model.fit("Y ~ 0", run=False)
    formula_model.build()
    formula_model.fit(samples=1)

    # Collect prior arguments of every non-random term, keyed by term name.
    direct_priors = {}
    for term in direct_model.terms.values():
        if not term.random:
            direct_priors[term.name] = term.prior.args
    formula_priors = {}
    for term in formula_model.terms.values():
        if not term.random:
            formula_priors[term.name] = term.prior.args

    # set() over a dict yields its keys, so this compares the term names.
    assert set(direct_priors) == set(formula_priors)
def test_nan_handling(crossed_data):
    """Check how ``build`` reacts to NaN values injected into a predictor's data.

    Without ``dropna`` the build must raise ``ValueError``. With ``dropna=True``
    it must emit a ``UserWarning`` whose message mentions the 3 dropped rows.

    The previously unused local ``data = crossed_data.copy()`` has been removed;
    it was never read.
    """
    nan_rows = [4, 6, 8]

    # Should fail because predictor has NaN
    model_fail_na = Model(crossed_data)
    model_fail_na.fit('Y ~ continuous', run=False)
    # NaNs are written into the term's data after the term has been created.
    model_fail_na.terms['continuous'].data[nan_rows, :] = np.nan
    with pytest.raises(ValueError):
        model_fail_na.build(backend='pymc3')

    # Should drop 3 rows with warning
    model_drop_na = Model(crossed_data, dropna=True)
    model_drop_na.fit('Y ~ continuous', run=False)
    model_drop_na.terms['continuous'].data[nan_rows, :] = np.nan
    with pytest.warns(UserWarning) as w:
        model_drop_na.build(backend='pymc3')
    # Only the first recorded warning is inspected.
    assert '3 rows' in w[0].message.args[0]
def test_distribute_group_specific_effect_over(diabetes_data):
    """Each level-specific ``C(age_grp)[..]|BMI`` term has a (442, 163) data matrix."""
    expected_shape = (442, 163)

    # 163 unique levels of BMI in diabetes_data.
    # With intercept: treatment encoding, so the first sorted level has no term.
    with_intercept = Model("BP ~ (C(age_grp)|BMI)", diabetes_data)
    with_intercept.build()
    kept_levels = sorted(diabetes_data["age_grp"].unique())[1:]
    for level in kept_levels:
        term = with_intercept.terms[f"C(age_grp)[{level}]|BMI"]
        assert term.data.shape == expected_shape
    assert "1|BMI" in with_intercept.terms

    # Without intercept: one term per level 0, 1 and 2, and no group intercept.
    without_intercept = Model("BP ~ (0 + C(age_grp)|BMI)", diabetes_data)
    without_intercept.build()
    for level in (0, 1, 2):
        term = without_intercept.terms[f"C(age_grp)[{level}]|BMI"]
        assert term.data.shape == expected_shape
    assert "1|BMI" not in without_intercept.terms