def test_maxT(self):
    """Check max-T corrected t-tests, for a plain fit and a block-wise fit.

    Simulates ``n`` samples with ``px`` regressors (the last column of X
    is a column of ones) and ``py_info + py_noize`` targets, where only
    the first ``py_info`` targets receive signal from X. It then checks:

    1. ``t_test`` and ``t_test_maxT`` return identical t-values and df.
    2. A model fitted with ``block=True`` yields t-values and df close to
       those of the plain fit.
    3. The number of uncorrected p-values below 0.05 is within +-10 of
       the expected positives, and the number of max-T p-values below
       0.05 is within +-2 of the expected true positives.
    """
    n = 100
    px = 5
    py_info = 2
    py_noize = 100
    beta = np.array([1, 0, -.5] + [0] * (px - 4) + [2]).reshape((px, 1))
    np.random.seed(42)
    X = np.hstack([np.random.randn(n, px - 1), np.ones((n, 1))])
    Y = np.random.randn(n, py_info + py_noize)
    # Causal model: add X on the first py_info variable
    Y[:, :py_info] += np.dot(X, beta)
    # One contrast per regressor.
    contrasts = np.identity(X.shape[1])

    # Plain fit: raw and max-T tests must agree on statistics and df.
    mod = mulm.MUOLS(Y, X).fit()
    tvals, rawp, df = mod.t_test(contrasts, pval=True, two_tailed=True)
    tvals2, maxT, df2 = mod.t_test_maxT(contrasts, two_tailed=True)
    assert np.all(tvals == tvals2)
    assert np.all(df == df2)

    # Block-wise fit. The raw t-test must come from ``mod_block`` (not
    # ``mod``), otherwise the block raw p-values are never exercised.
    mod_block = mulm.MUOLS(Y, X).fit(block=True, max_elements=1000)
    tvals_block, rawp_block, df_block = mod_block.t_test(
        contrasts, pval=True, two_tailed=True)
    tvals_block2, maxT_block, df_block2 = mod_block.t_test_maxT(
        contrasts, two_tailed=True)
    assert_allclose(tvals_block, tvals_block2)
    assert_allclose(tvals_block2, tvals2)
    assert_allclose(df_block, df_block2)
    assert_allclose(df_block2, df2)

    # Expected positives at the 0.05 level:
    # - true positives: 3 non-zero betas for each informative target;
    # - false positives: 5% of all remaining (target, regressor) tests.
    expected_tp = py_info * 3
    expected_fp = ((py_info + py_noize) * 5 - expected_tp) * 0.05
    expected_p = expected_tp + expected_fp

    # Number of raw positives lies within expected positives +-10
    n_rawp = np.sum(rawp < 0.05)
    n_rawp_block = np.sum(rawp_block < 0.05)
    assert (expected_p - 10) < n_rawp < (expected_p + 10)
    assert (expected_p - 10) < n_rawp_block < (expected_p + 10)

    # Number of max-T positives lies within expected true positives +-2
    n_maxT = np.sum(maxT < 0.05)
    n_maxT_block = np.sum(maxT_block < 0.05)
    assert (expected_tp - 2) < n_maxT < (expected_tp + 2)
    assert (expected_tp - 2) < n_maxT_block < (expected_tp + 2)
def univar_stats(Y, X, path_prefix, mask_img):
    """Fit mass-univariate OLS, test the first regressor and save maps.

    Parameters
    ----------
    Y, X: arrays passed as-is to ``mulm.MUOLS(Y, X)``.
    path_prefix: str, prefix of the three output files
        (``_log10pvals.nii.gz``, ``_tstat.nii.gz``, ``_tstat.png``).
    mask_img: image whose ``affine`` is reused for the output images.

    Returns
    -------
    tstat_arr, pvals_arr: arrays shaped like ``mask_arr``; ``pvals_arr``
        holds -log10(p-value), both are zero outside the mask.

    Notes
    -----
    Reads the module-level ``mask_arr`` (not a parameter).
    NOTE(review): presumably the boolean array of ``mask_img`` -- confirm.
    """
    # Contrast selecting the first column of X only.
    n_regressors = X.shape[1]
    contrasts = [1] + [0] * (n_regressors - 1)
    fitted = mulm.MUOLS(Y, X).fit()
    tvals, pvals, df = fitted.t_test(contrasts, pval=True, two_tailed=True)

    # Summary: [threshold, count of p < threshold, proportion].
    summary = []
    for thres in 10. ** np.array([-4, -3, -2]):
        n_below = np.sum(pvals < thres)
        summary.append([thres, n_below, n_below / pvals.size])
    print(summary)
    # Output recorded by the original author:
    # {'voxsize': 1.5, 'smoothing': 0, 'target': 'dx_num'}
    # [[0.0001, 23068, 0.058190514149063371], [0.001, 47415, 0.11960738808643315], [0.01, 96295, 0.24291033292804132]]

    # Scatter the first contrast's statistics back into mask-shaped arrays.
    tstat_arr = np.zeros(mask_arr.shape)
    pvals_arr = np.zeros(mask_arr.shape)
    pvals_arr[mask_arr] = -np.log10(pvals[0])
    tstat_arr[mask_arr] = tvals[0]

    pvals_img = nibabel.Nifti1Image(pvals_arr, affine=mask_img.affine)
    pvals_img.to_filename(path_prefix + "_log10pvals.nii.gz")
    tstat_img = nibabel.Nifti1Image(tstat_arr, affine=mask_img.affine)
    tstat_img.to_filename(path_prefix + "_tstat.nii.gz")

    # Four stacked panels: glass brains first, then stat maps.
    threshold = 3
    fig = plt.figure(figsize=(13.33, 7.5 * 4))
    stat_map_opts = dict(colorbar=True, draw_cross=False)
    panels = [
        (411, "-log pvalues >%.2f", plotting.plot_glass_brain, pvals_img, {}),
        (412, "T-stats T>%.2f", plotting.plot_glass_brain, tstat_img, {}),
        (413, "-log pvalues >%.2f", plotting.plot_stat_map, pvals_img,
         stat_map_opts),
        (414, "T-stats T>%.2f", plotting.plot_stat_map, tstat_img,
         stat_map_opts),
    ]
    for position, title_fmt, plot_fn, img, extra in panels:
        ax = fig.add_subplot(position)
        ax.set_title(title_fmt % threshold)
        plot_fn(img, threshold=threshold, figure=fig, axes=ax, **extra)
    plt.savefig(path_prefix + "_tstat.png")

    return tstat_arr, pvals_arr
def residualize(Y, formula_res, data, formula_full=None):
    """Residualization or adjusted residualization of Y.

    A full model is fitted, then the fitted contribution of the design
    columns belonging to the terms of ``formula_res`` is subtracted
    from Y.

    Parameters
    ----------
    Y: array (n, p), dependent variables.
    formula_res: str, residualization formula, e.g. "site":
        1) Fit Y = b0 + b1 site + eps
        2) Return Y - b0 - b1 site
    data: DataFrame of independent variables.
    formula_full: str, full model formula (default None, meaning
        ``formula_res`` is also the full model),
        e.g. "age + sex + site + diagnosis". When given, an adjusted
        residualization is performed:
        1) Fit Y = b1 age + b2 sex + b3 site + b4 diagnosis + eps
        2) Return Y - b3 site

    Returns
    -------
    Y: array (n, p) of residualized dependent variables.
    """
    if formula_full is None:
        formula_full = formula_res

    # Term names of the residualization formula.
    _, res_contrasts, _ = mulm.design_matrix(formula=formula_res, data=data)
    res_terms = res_contrasts.keys()

    # Design matrix and per-term t-contrasts of the full model.
    X, t_contrasts, _ = mulm.design_matrix(formula=formula_full, data=data)
    mod_mulm = mulm.MUOLS(Y, X).fit()

    # Columns of the full design selected by the residualization terms:
    # those whose summed contrasts (over matching terms) equal exactly 1.
    selected = [cont for term, cont in t_contrasts.items()
                if term in res_terms]
    mask = np.array(selected).sum(axis=0) == 1

    fitted_res = np.dot(X[:, mask], mod_mulm.coef[mask, :])
    return Y - fitted_res
def test_ttest(self):
    """Compare MUOLS two-tailed t-tests with statsmodels, column by column.

    Also checks that a block-wise fit (``block=True``) gives the same
    t-values, p-values and df as the plain fit.
    """
    n, px = 100, 5
    py_info, py_noize = 2, 100
    beta = np.array([1, 0, -.5] + [0] * (px - 4) + [2]).reshape((px, 1))
    # Last column of X is the intercept.
    X = np.hstack([np.random.randn(n, px - 1), np.ones((n, 1))])
    Y = np.random.randn(n, py_info + py_noize)
    # Causal model: add X on the first py_info variable
    Y[:, :py_info] += np.dot(X, beta)

    # Two-tailed t-test of every regressor.
    contrasts = np.identity(X.shape[1])

    # Reference: statsmodels fits one target column at a time.
    sm_results = [sm.OLS(Y[:, j], X).fit().t_test(contrasts)
                  for j in range(Y.shape[1])]
    sm_tvals = np.asarray([res.tvalue for res in sm_results]).T
    sm_pvals = np.asarray([res.pvalue for res in sm_results]).T

    # MULM, plain fit.
    mod = mulm.MUOLS(Y, X).fit()
    mulm_tvals, mulm_pvals, mulm_df = mod.t_test(
        contrasts, pval=True, two_tailed=True)

    # MULM, block-wise fit.
    mod_block = mulm.MUOLS(Y, X).fit(block=True, max_elements=1000)
    mulm_tvals_block, mulm_pvals_block, mulm_df_block = mod_block.t_test(
        contrasts, pval=True, two_tailed=True)

    # MULM must agree with statsmodels ...
    assert_almost_equal(mulm_tvals, sm_tvals)
    assert_almost_equal(mulm_pvals, sm_pvals)
    # ... and the block-wise fit must agree with the plain fit.
    assert_allclose(mulm_tvals, mulm_tvals_block)
    assert_allclose(mulm_pvals, mulm_pvals_block)
    assert_allclose(mulm_df, mulm_df_block)
def fit(self, Y, design_mat):
    """Fit a mass-univariate OLS of Y on the design matrix.

    Parameters
    ----------
    Y: array (n, p)
        Dependent variables.
    design_mat: array (n, k)
        Design matrix of independent variables.

    Returns
    -------
    self, with the fitted model stored in ``self.mod_mulm``.

    Raises
    ------
    AssertionError
        If Y and design_mat differ in number of rows, or if
        ``self.mask`` does not have one entry per design column.
    """
    n_rows_y = Y.shape[0]
    n_rows_design = design_mat.shape[0]
    assert n_rows_y == n_rows_design

    mask_len = self.mask.shape[0]
    n_cols_design = design_mat.shape[1]
    assert mask_len == n_cols_design

    fitted = mulm.MUOLS(Y, design_mat).fit()
    self.mod_mulm = fitted
    return self
def univ_stats(Y, formula, data):
    """Fit a mass-univariate OLS and run one F-test per formula term.

    Parameters
    ----------
    Y: array (n_subjects, n_features)
    formula: str, e.g. "age + sex + site"
    data: DataFrame containing the values of the formula terms

    Returns
    -------
    mod_mulm: the fitted ``mulm.MUOLS`` model.
    aov_mulm: OrderedDict mapping each term of ``f_contrasts`` to the
        result of ``mod_mulm.f_test(..., pval=True)`` for that term.
    """
    X, t_contrasts, f_contrasts = mulm.design_matrix(formula=formula,
                                                     data=data)
    mod_mulm = mulm.MUOLS(Y, X).fit()

    # One F-test per term, in the iteration order of f_contrasts.
    aov_mulm = OrderedDict()
    for term in f_contrasts:
        aov_mulm[term] = mod_mulm.f_test(f_contrasts[term], pval=True)

    return mod_mulm, aov_mulm
def test_ttest_ftest_vs_statsmodels(self):
    """Compare MUOLS t- and F-tests with a statsmodels type-2 ANOVA.

    Downloads the salary dataset (network access required), fits
    ``salary ~ experience + education + management`` with statsmodels,
    then fits the dummy-coded design with MUOLS and checks F statistics
    and p-values term by term.
    """
    url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'
    data = pd.read_csv(url)

    # Fit with statsmodels
    oneway = smfrmla.ols('salary ~ experience + education + management',
                         data).fit()
    aov = sm.stats.anova_lm(oneway, typ=2)  # Type 2 ANOVA DataFrame

    # Fit with MULM: every column but the first, dummy-coded.
    X_df = pd.get_dummies(data.iloc[:, 1:])
    X = np.asarray(X_df).astype(float)
    Y = np.asarray(data.salary)[:, None].astype(float)

    # F-contrasts: square matrices with ones on the diagonal entries of
    # the design columns belonging to each term.
    n_cols = X.shape[1]
    con_exp = np.zeros((n_cols, n_cols))
    con_exp[0, 0] = 1
    con_edu = np.zeros((n_cols, n_cols))
    con_edu[[1, 2, 3], [1, 2, 3]] = 1
    con_man = np.zeros((n_cols, n_cols))
    con_man[[4, 5], [4, 5]] = 1

    import mulm
    mod = mulm.MUOLS(Y, X).fit()
    # Degrees of freedom are not checked here; they are bound to ``_`` so
    # they do not shadow the DataFrame.
    tvals_exp, rawp_expt, _ = mod.t_test([1, 0, 0, 0, 0, 0], pval=True,
                                         two_tailed=True)
    fvals_exp, rawp_exp, _ = mod.f_test(con_exp, pval=True)
    fvals_edu, rawp_edu, _ = mod.f_test(con_edu, pval=True)
    fvals_man, rawp_man, _ = mod.f_test(con_man, pval=True)

    # Single-contrast case: F equals t squared, while the two-tailed
    # t-test p-value equals the F-test p-value (it must NOT be squared).
    assert np.allclose(aov.loc['experience', 'F'], tvals_exp[0]**2)
    assert np.allclose(aov.loc['experience', 'PR(>F)'], rawp_expt[0])
    assert np.allclose(aov.loc['experience', 'F'], fvals_exp[0])
    assert np.allclose(aov.loc['experience', 'PR(>F)'], rawp_exp[0])
    assert np.allclose(aov.loc['education', 'F'], fvals_edu[0])
    assert np.allclose(aov.loc['education', 'PR(>F)'], rawp_edu[0])
    assert np.allclose(aov.loc['management', 'F'], fvals_man[0])
    assert np.allclose(aov.loc['management', 'PR(>F)'], rawp_man[0])
def fit(self, Y, X):
    """Fit parameters of p linear models where each Y is regressed on X.

    When ``self.contrast_res`` is None it is first replaced by an
    all-True boolean vector with one entry per column of X.

    Parameters
    ----------
    Y: array (n, p)
        Dependent variables.
    X: array (n, k)
        Design matrix of independent variables.

    Returns
    -------
    self, with the fitted model stored in ``self.mod_mulm``.

    Raises
    ------
    AssertionError
        If Y and X differ in number of rows, or if ``self.contrast_res``
        does not have one entry per column of X.
    """
    # Default contrast: select every column of the design matrix.
    if self.contrast_res is None:
        self.contrast_res = np.ones(X.shape[1], dtype=bool)

    n_rows_y = Y.shape[0]
    n_rows_x = X.shape[0]
    assert n_rows_y == n_rows_x

    contrast_len = self.contrast_res.shape[0]
    assert contrast_len == X.shape[1], "contrast doesn't match design matrix"

    fitted = mulm.MUOLS(Y, X).fit()
    self.mod_mulm = fitted
    return self
# Design matrix: SNP columns m-1 and m, followed by the covariates.
snp = snps[:, (m-1):(m+1)]
X = np.hstack((snp, cov_util))
#STOP #to interact with the interpreter

#MUOLS
# One statistic and one p-value per image column, filled chunk by chunk.
s_map = np.zeros(images.shape[1])
p_map = np.zeros(images.shape[1])
# Chunk starts every 10000 columns; each chunk ends where the next one
# starts, the last one ending at the total number of columns.
# list() keeps the concatenation valid on Python 3, where range() is not
# a list.
debut = list(range(0, images.shape[1], 10000))
fin = debut + [images.shape[1]]
fin = fin[1:]
for d, f in zip(debut, fin):
    print(d, f)
    # NOTE(review): MUOLS() / fit(X, Y) / stats_t_coefficients looks like
    # an older mulm API -- confirm against the installed version.
    bigols = mulm.MUOLS()
    bigols.fit(X, images[:, d:f])
    # Contrast on the second column of X.
    contrast = [0.,1.,0.,0.]
    # contrast = [0.,1.,0.,0.,0.,0.,0.,0.,0.,0.,0.,0.]
    s, p = bigols.stats_t_coefficients(X, images[:,d:f], contrast, pval=True)
    s_map[d:f] = s[:]
    p_map[d:f] = p[:]

# Put the statistics back into a volume shaped like the mask template.
template_for_size = os.path.join(PROJECT_DIR, '2013_imagen_bmi', 'data', 'mask', 'mask.nii')
template_for_size_img = ni.load(template_for_size)
image = np.zeros(template_for_size_img.get_data().shape)
image[masked_data_index] = s_map
pn = os.path.join(PROJECT_DIR, 'documents', '2014jan24_Plink', 'bbox_stats_7182018_covGenderPDS.nii.gz')
# Reference output from R for comparison:
#Residuals:
#    Min      1Q  Median      3Q     Max
#-2.7101 -0.4352  0.0232  0.5453  2.1318
#
#Coefficients:
#            Estimate Std. Error t value Pr(>|t|)
#(Intercept) -0.01548    0.10158  -0.152    0.879
#X[, 1]       0.01158    0.06784   0.171    0.865
#
#Residual standard error: 0.8629 on 362 degrees of freedom
#Multiple R-squared: 8.051e-05, Adjusted R-squared: -0.002682
#F-statistic: 0.02915 on 1 and 362 DF, p-value: 0.8645
for k in snp.keys():
    print('\n====SNP : ', k)
    # NOTE(review): if snp[k] is a plain integer index, x is a view and
    # the recoding below also modifies X in place -- confirm.
    x = X[:, snp[k]].reshape((n, -1))
    #transcoding should be performed to be compliant to R convention
    # Swap codes 0 and 2, using 3 as a temporary value.
    x[x == 2] = 3
    x[x == 0] = 2
    x[x == 3] = 0
    #intercept
    x = np.hstack((x, np.ones((x.shape[0], 1))))
    x[:, -1] = 1
    # NOTE(review): MUOLS() / fit(x, y) / stats_*_coefficients looks like
    # an older mulm API -- confirm against the installed version.
    olser = mulm.MUOLS()
    olser.fit(x, y)
    betas = olser.coef_
    # Contrast on the SNP column (the intercept is the last column).
    contrast = [1., 0.]
    t, p = olser.stats_t_coefficients(x, y, contrast, pval=True)
    # p is rebound here: the p-value printed below is the F-test's.
    s, p = olser.stats_f_coefficients(x, y, contrast, pval=True)
    print('betas = ', betas, '\n\n')
    print('stat-t, stat-f, p-val model additif= ', t, s, p)
# Read mask: voxels with a non-zero value are inside the mask
mask_img = nibabel.load(os.path.join(WD, "data", "mni_cerebrum-mask.nii.gz"))
mask_arr = mask_img.get_fdata() != 0

# Apply mask: keep in-mask voxels only
X_train = imgs_train[:, mask_arr]
X_test = imgs_test[:, mask_arr]

################################################################################
# Univariate statistics
# ---------------------

Z_train, t_contrasts, f_contrasts = mulm.design_matrix(formula="sex + age",
                                                       data=demo_train)
mod_mulm = mulm.MUOLS(Y=X_train, X=Z_train).fit()


def flat_to_img(mask_img, flat_values):
    """Scatter flat in-mask values into an image shaped like mask_img.

    Voxels where the mask data is zero are left at 0.
    """
    mask_data = mask_img.get_fdata()
    volume = np.zeros(mask_data.shape)
    volume[mask_data != 0] = flat_values.squeeze()
    return nilearn.image.new_img_like(mask_img, volume)


# t-statistic map of the 'sex' term
tstat_sex, pval_sex, df_sex = mod_mulm.t_test(t_contrasts['sex'], pval=True)
tstat_sex_img = flat_to_img(mask_img, tstat_sex.squeeze())
nilearn.plotting.plot_stat_map(tstat_sex_img, title="sex")

# t-statistic map of the 'age' term
tstat_age, pval_age, df_age = mod_mulm.t_test(t_contrasts['age'], pval=True)
tstat_age_img = flat_to_img(mask_img, tstat_age.squeeze())
nilearn.plotting.plot_stat_map(tstat_age_img, title="age")
# 1st model
MODEL = ["Gender", "Age", "VSF", "Scanner_Type"]
design_mat = utils.make_design_matrix(df, regressors=MODEL).as_matrix()
# Drop every subject (row) whose design contains at least one NaN, from
# both the design matrix and the images.
isnan = numpy.isnan(design_mat)
if isnan.any():
    # Row indices of NaN entries; a row with several NaNs appears
    # several times.
    bad_subject_ind = numpy.where(isnan)[0]
    print("Removing subject", bad_subject_ind)
    design_mat = numpy.delete(design_mat, bad_subject_ind, axis=0)
    images = numpy.delete(images, bad_subject_ind, axis=0)
# Fit LM & compute residuals
# NOTE(review): MUOLS() / fit(X=, Y=) / predict looks like an older mulm
# API -- confirm against the installed version.
lm = mulm.MUOLS()
lm.fit(X=design_mat, Y=images)
images_pred = lm.predict(X=design_mat)
res = images - images_pred
# Write to file
residual_name = 'masked_images_' + '_'.join(MODEL)
print("Writing images to", residual_name)
data_api.write_images(h5file, res, residual_name)

# 2nd model
# NOTE(review): `images` may have lost rows above while this design is
# rebuilt from the full `df` -- check that rows still match downstream.
MODEL = ["Gender", "Age", "VSF", "ImagingCentreCity"]
design_mat = utils.make_design_matrix(df, regressors=MODEL).as_matrix()
Credit: E Duchesnay
"""
import numpy as np
import mulm
import pylab as plt

# Simulation sizes: n samples, px regressors, py_info informative targets
# and py_noize pure-noise targets.
n = 100
px = 5
py_info = 2
py_noize = 100
beta = np.array([1, 0, .5] + [0] * (px - 4) + [2]).reshape((px, 1))
X = np.hstack([np.random.randn(n, px-1), np.ones((n, 1))]) # X with intercept
Y = np.random.randn(n, py_info + py_noize)
# Causal model: add X on the first py_info variable
Y[:, :py_info] += np.dot(X, beta)
# t-test all the regressors (by default mulm and sm do two-tailed tests)
contrasts = np.identity(X.shape[1])
mod = mulm.MUOLS(Y, X)
# Raw p-values, then max-T and min-P p-values from the same fitted model.
tvals, rawp, df = mod.fit().t_test(contrasts, pval=True, two_tailed=True)
tvals, maxT, df2 = mod.t_test_maxT(contrasts, two_tailed=True)
tvals3, minP, df3 = mod.t_test_minP(contrasts, two_tailed=True)
# Histogram of the three kinds of p-values for the first contrast.
# Note: `n` is rebound here to the first value returned by plt.hist.
n, bins, patches = plt.hist([rawp[0,:], maxT[0,:], minP[0,:]], color=['blue', 'red', 'green'], label=['rawp','maxT', 'minP'])
plt.legend()
plt.show()
]], pd.get_dummies(pop_treat_ses01[['site']]) ], axis=1)
# Count missing values per column before and after imputation.
print(Zdf.isnull().sum())
# Replace missing age_onset values with the column mean.
Zdf.loc[Zdf["age_onset"].isnull(), "age_onset"] = Zdf["age_onset"].mean()
print(Zdf.isnull().sum())
Z = np.asarray(Zdf)
## OLS with MULM
# Contrast selecting the first column of Z only.
contrasts = [1] + [0] * (Zdf.shape[1] - 1)
mod = mulm.MUOLS(XTreat, Z)
tvals, pvals, df = mod.fit().t_test(contrasts, pval=True, two_tailed=True)
# Summary per threshold: [threshold, count of p < threshold, proportion].
print([[thres, np.sum(pvals < thres), np.sum(pvals < thres) / pvals.size] for thres in 10.**np.array([-4, -3, -2])])
# Output recorded by the original author:
# [[0.0001, 34, 8.5521897378753849e-05], [0.001, 333, 0.0008376115243272068], [0.01, 3374, 0.0084867906398798671]]
# Scatter the first contrast's statistics into mask-shaped arrays
# (-log10 of the p-values, and the t-values); zero outside the mask.
tstat_arr = np.zeros(mask_arr.shape)
pvals_arr = np.zeros(mask_arr.shape)
pvals_arr[mask_arr] = -np.log10(pvals[0])
tstat_arr[mask_arr] = tvals[0]
pvals_img = nibabel.Nifti1Image(pvals_arr, affine=mask_img.affine)
################################################################################
# Example 1: Salary dataset
# -------------------------
#
# Fit a single model: `salary ~ experience + education + management`

url = 'https://github.com/duchesnay/pystatsml/raw/master/datasets/salary_table.csv'
df = pd.read_csv(url)

################################################################################
# Fit with MULM

# Single target column, shaped (n, 1).
Y = np.asarray(df.salary)[:, None].astype(float)
X, t_contrasts, f_contrasts = mulm.design_matrix(
    formula="experience + education + management", data=df)
mod_mulm = mulm.MUOLS(Y, X).fit()

# One t-test, then one F-test, per term, keeping the order of the
# contrasts.
tstat_mulm = OrderedDict(
    [(term, mod_mulm.t_test(t_contrasts[term], pval=True))
     for term in t_contrasts])
fstat_mulm = OrderedDict(
    [(term, mod_mulm.f_test(f_contrasts[term], pval=True))
     for term in f_contrasts])

print(mod_mulm.coef)
print(pd.DataFrame(tstat_mulm, index=['tstat', 'pval', 'df']).T)
# print(pd.DataFrame(fstat_mulm, index=['fstat', 'pval']).T)

################################################################################
# Fit with statsmodel

mod_sm = smfrmla.ols('salary ~ experience + education + management', df).fit()
print(mod_sm.summary())
fstat_sm = sm.stats.anova_lm(mod_sm, typ=2)  # Type 2 ANOVA DataFrame