def fit(results, components, n_subjects=26):
    """Fit Lasso and OLS models predicting test correctness.

    Builds a design matrix with one row per entry of ``results``.  Column
    ``result.subject_id`` of each row is set to 1 (presumably one of the
    first ``n_subjects`` columns -- confirm against the caller).  Starting
    at column ``n_subjects`` there are two indicator columns per
    component: the first is 1 when the component occurs exactly once among
    the test's double and single components, the second when it occurs
    exactly twice.

    A Lasso model (alpha 0.001) is fitted first; each coefficient is
    scaled by 1000, rounded, and printed with its column index when the
    rounded value is non-zero.  An OLS model is then fitted and its
    summary printed.

    Parameters
    ----------
    results : sequence
        Objects exposing ``correct``, ``subject_id``,
        ``test.double.components`` and ``test.single.components``.
    components : list
        Known components; the position of a component in this list fixes
        which pair of columns it occupies.
    n_subjects : int, optional
        Number of leading columns that precede the component columns.
        Defaults to 26.

    Returns
    -------
    Y, X : np.ndarray
        The response column, shape ``(len(results), 1)``, and the design
        matrix, shape ``(len(results), n_subjects + 2 * len(components))``.

    Raises
    ------
    SystemExit
        If a test component is not in ``components``.  The message is
        written to stderr and the exit status is non-zero.
    """
    n_results = len(results)
    n_components = len(components)
    X = np.zeros((n_results, n_subjects + 2 * n_components))
    Y = np.zeros((n_results, 1))

    for i, result in enumerate(results):
        Y[i] = result.correct

        # How many times each known component occurs in this test.
        counts = np.zeros(n_components)
        test = result.test
        for component in test.double.components + test.single.components:
            try:
                index = components.index(component)
            except ValueError:
                # A string argument makes sys.exit report the message on
                # stderr and exit with status 1, so the failure is visible
                # to the calling process.
                sys.exit("Couldn't find %s" % component)
            counts[index] += 1

        # Interleaved pairs: even offsets flag "occurs once", odd offsets
        # flag "occurs twice".
        X[i, n_subjects::2] = counts == 1
        X[i, n_subjects + 1::2] = counts == 2
        X[i, result.subject_id] = 1

    # Kept function-local: these packages are only needed once the design
    # matrix has been built.
    from scikits.statsmodels.api import OLS
    from sklearn import linear_model as lm

    lasso = lm.Lasso(0.001).fit(X, Y)
    for i, beta in enumerate(lasso.coef_):
        scaled = round(1000 * beta)
        if abs(scaled) > 0.1:
            print("%s %s" % (i, scaled))

    print("%s %s" % (Y.shape, X.shape))
    ols_results = OLS(Y, X).fit()
    print(ols_results.summary())
    return Y, X
def typeIII(response, ancova, recarray):
    """
    Build an ANCOVA table with type III sums of squares.

    The full model given by `ancova.formula` is fitted once; every
    contrast named in `ancova.contrast_names` is then F-tested on that
    fit.  A final 'Residuals' row carries the residual sum of squares and
    residual degrees of freedom of the full model.

    Inputs
    ------

    response: str
              name of the response field in recarray

    ancova: ANCOVA
            specification of the model to fit

    recarray: np.ndarray
              must hold the response field and every field used by the
              terms of ancova

    Outputs
    -------

    table: record array with fields 'contrast', 'SS', 'df', 'MS', 'F'
           and 'p_value', one row per contrast plus 'Residuals'
    """
    design = ancova.formula.design(recarray, return_float=True)
    fitted = OLS(recarray[response], design).fit()
    resid_ss = np.sum(fitted.resid**2)
    resid_df = fitted.df_resid

    labels = []
    f_col = []
    df_col = []
    ss_col = []
    p_col = []
    for label in ancova.contrast_names:
        test = fitted.f_test(ancova.contrast_matrices[label])
        labels.append(label)
        f_col.append(test.fvalue)
        df_col.append(test.df_num)
        p_col.append(test.pvalue)
        # F * (residual variance estimate) * numerator df gives the
        # sum of squares attributed to this contrast.
        ss_col.append(test.fvalue * fitted.scale * test.df_num)

    # Closing row for the residuals; it has no F statistic or p-value.
    labels.append('Residuals')
    ss_col.append(resid_ss)
    df_col.append(resid_df)
    f_col.append(np.nan)
    p_col.append(np.nan)

    width = max([len(label) for label in labels])
    table = np.array(labels, np.dtype([('contrast', 'S%d' % width)]))
    mean_squares = np.array(ss_col) / np.array(df_col)
    table = ML.rec_append_fields(
        table,
        ['SS', 'df', 'MS', 'F', 'p_value'],
        [ss_col, df_col, mean_squares, f_col, p_col])
    return table
def typeIII(response, ancova, recarray):
    """
    Type III sum-of-squares ANCOVA table for a given ANCOVA formula.

    One fit of the full model is made, and each named contrast is
    F-tested on it.  The last row, 'Residuals', reports the residual sum
    of squares and degrees of freedom of that fit.

    Inputs
    ------

    response: str
              field of recarray holding the response

    ancova: ANCOVA
            the model to be fitted

    recarray: np.ndarray
              needs the response field plus all fields referenced by the
              terms of ancova

    Outputs
    -------

    result: record array with fields 'contrast', 'SS', 'df', 'MS', 'F'
            and 'p_value'
    """
    full_fit = OLS(
        recarray[response],
        ancova.formula.design(recarray, return_float=True)).fit()
    sse = np.sum(full_fit.resid**2)
    df_resid = full_fit.df_resid

    # Run every contrast test up front, then read the columns off them.
    tested = [(name, full_fit.f_test(ancova.contrast_matrices[name]))
              for name in ancova.contrast_names]

    names = [name for name, _ in tested] + ['Residuals']
    fs = [t.fvalue for _, t in tested] + [np.nan]
    dfs = [t.df_num for _, t in tested] + [df_resid]
    pvals = [t.pvalue for _, t in tested] + [np.nan]
    sss = [t.fvalue * full_fit.scale * t.df_num for _, t in tested] + [sse]

    name_dtype = np.dtype(
        [('contrast', 'S%d' % max([len(name) for name in names]))])
    result = np.array(names, name_dtype)
    return ML.rec_append_fields(
        result,
        ['SS', 'df', 'MS', 'F', 'p_value'],
        [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
def typeII(response, ancova, recarray):
    """
    Build an ANCOVA table with type II sums of squares.

    For each contrast, the model returned by `ancova.all_but_above` is
    fitted and the contrast is F-tested inside it.  The resulting sum of
    squares is then compared with the residual mean square of the full
    model to get the reported F statistic and p-value.  A 'Residuals' row
    for the full model closes the table.

    Inputs
    ------

    response: str
              name of the response field in recarray

    ancova: ANCOVA
            specification of the model to fit

    recarray: np.ndarray
              must hold the response field and every field used by the
              terms of ancova

    Outputs
    -------

    table: record array with fields 'contrast', 'SS', 'df', 'MS', 'F'
           and 'p_value'
    """
    Y = recarray[response]
    full_fit = OLS(
        Y, ancova.formula.design(recarray, return_float=True)).fit()
    full_sse = np.sum(full_fit.resid**2)
    full_df = full_fit.df_resid

    labels = []
    ss_col = []
    f_col = []
    df_col = []
    p_col = []

    pairs = zip(ancova.contrast_names, ancova.sequence())
    for label, (expr, factors) in pairs:
        reduced = ancova.all_but_above(expr, factors)
        contrast = ancova.contrasts[label]
        sub_design, sub_contrasts = reduced.formula.design(
            recarray, contrasts={'C': contrast})
        sub_fit = OLS(Y, sub_design).fit()
        sub_sse = np.sum(sub_fit.resid**2)
        sub_df = sub_fit.df_resid
        sub_test = sub_fit.f_test(sub_contrasts['C'])

        # Residual sum of squares and df of the sub-model with the
        # contrast removed, recovered from the F test.
        restricted_sse = (
            sub_sse + sub_test.fvalue * sub_test.df_num * (sub_sse / sub_df))
        restricted_df = sub_df + sub_test.df_num

        ss_col.append(restricted_sse - sub_sse)
        df_col.append(sub_test.df_num)
        f_col.append(
            ((restricted_sse - sub_sse) / (restricted_df - sub_df))
            / (full_sse / full_df))
        p_col.append(f_dbn.sf(f_col[-1], restricted_df - sub_df, full_df))
        labels.append(label)

    # Closing row for the residuals of the full model.
    labels.append('Residuals')
    ss_col.append(full_sse)
    df_col.append(full_df)
    f_col.append(np.nan)
    p_col.append(np.nan)

    width = max([len(label) for label in labels])
    table = np.array(labels, np.dtype([('contrast', 'S%d' % width)]))
    table = ML.rec_append_fields(
        table,
        ['SS', 'df', 'MS', 'F', 'p_value'],
        [ss_col, df_col, np.array(ss_col) / np.array(df_col), f_col, p_col])
    return table
def typeI(response, ancova, recarray):
    """
    Produce an ANCOVA table from a given ANCOVA formula
    with type I (sequential) sums of squares, where the order is
    the order of terms in the contrast_names of ancova.

    The first term is measured against the empty model: its sum of
    squares is np.sum(Y**2) minus the residual sum of squares of the
    model built from ancova.formulae[0].  Each later term is the drop in
    residual sum of squares when its formula is added to the preceding
    ones.  Every F statistic divides by the residual mean square of the
    full model, so every p-value uses the full model's residual degrees
    of freedom as the denominator degrees of freedom.

    Inputs
    ------

    response: str
              field name of response in recarray

    ancova: ANCOVA
            specifies the model to be fit

    recarray: np.ndarray
              should contain all field names in the terms of ancova
              as well as response

    Outputs
    -------

    result: record array with fields 'contrast', 'SS', 'df', 'MS', 'F'
            and 'p_value'; the last row, 'Residuals', describes the
            last sequential model fitted
    """
    Y = recarray[response]

    # Full model: supplies the common denominator of all F statistics.
    full_fit = OLS(
        Y, ancova.formula.design(recarray, return_float=True)).fit()
    SSE_F = np.sum(full_fit.resid**2)
    df_F = full_fit.df_resid

    first_fit = OLS(
        Y, ancova.formulae[0].design(recarray, return_float=True)).fit()
    SSE_old = np.sum(first_fit.resid**2)
    df_old = first_fit.df_resid

    first_ss = np.sum(Y**2) - SSE_old
    first_df = Y.shape[0] - df_old
    names = [ancova.contrast_names[0]]
    sss = [first_ss]
    dfs = [first_df]
    fs = [(first_ss / first_df) / (SSE_F / df_F)]
    pvals = [f_dbn.sf(fs[-1], first_df, df_F)]

    for d in range(1, len(ancova.formulae)):
        terms = []
        for f in ancova.formulae[:(d + 1)]:
            terms += list(f.terms)

        # JT: this is not numerically efficient
        # could be done by updating some factorization of the full X
        X = Formula(terms).design(recarray, return_float=True)
        step_fit = OLS(Y, X).fit()
        SSE_new = np.sum(step_fit.resid**2)
        df_new = step_fit.df_resid

        sss.append(SSE_old - SSE_new)
        dfs.append(df_old - df_new)
        fs.append(((SSE_old - SSE_new) / (df_old - df_new)) / (SSE_F / df_F))
        # The denominator of the F statistic is the full-model mean
        # square, so its degrees of freedom are df_F, not df_new.
        pvals.append(f_dbn.sf(fs[-1], df_old - df_new, df_F))
        names.append(ancova.contrast_names[d])
        SSE_old = SSE_new
        df_old = df_new

    # Add in the "residual row".  SSE_old / df_old always describe the
    # last sequential model, including when there is only one formula
    # and the loop above never runs.
    sss.append(SSE_old)
    dfs.append(df_old)
    pvals.append(np.nan)
    fs.append(np.nan)
    names.append('Residuals')

    result = np.array(
        names,
        np.dtype([('contrast', 'S%d' % max([len(n) for n in names]))]))
    result = ML.rec_append_fields(
        result,
        ['SS', 'df', 'MS', 'F', 'p_value'],
        [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
    return result
def typeII(response, ancova, recarray):
    """
    Type II sum-of-squares ANCOVA table for a given ANCOVA formula.

    Each contrast is F-tested inside the model produced by
    `ancova.all_but_above`; the sum of squares obtained there is divided
    by the residual mean square of the full model to give the reported F
    statistic.  The table ends with a 'Residuals' row for the full model.

    Inputs
    ------

    response: str
              field of recarray holding the response

    ancova: ANCOVA
            the model to be fitted

    recarray: np.ndarray
              needs the response field plus all fields referenced by the
              terms of ancova

    Outputs
    -------

    result: record array with fields 'contrast', 'SS', 'df', 'MS', 'F'
            and 'p_value'
    """
    Y = recarray[response]
    X = ancova.formula.design(recarray, return_float=True)
    overall = OLS(Y, X).fit()
    SSE_F = np.sum(overall.resid**2)
    df_F = overall.df_resid

    # Each row is (contrast, SS, df, F, p_value).
    rows = []
    for name, expr_factors in zip(ancova.contrast_names, ancova.sequence()):
        expr, factors = expr_factors
        partial = ancova.all_but_above(expr, factors)
        XF, matrices = partial.formula.design(
            recarray, contrasts={'C': ancova.contrasts[name]})
        partial_fit = OLS(Y, XF).fit()
        SSEF = np.sum(partial_fit.resid**2)
        dfF = partial_fit.df_resid
        ftest = partial_fit.f_test(matrices['C'])

        # Restricted-model quantities recovered from the F test.
        SSER = SSEF + ftest.fvalue * ftest.df_num * (SSEF / dfF)
        dfR = dfF + ftest.df_num

        fstat = ((SSER - SSEF) / (dfR - dfF)) / (SSE_F / df_F)
        pvalue = f_dbn.sf(fstat, dfR - dfF, df_F)
        rows.append((name, SSER - SSEF, ftest.df_num, fstat, pvalue))

    # The residual row has no F statistic or p-value.
    rows.append(('Residuals', SSE_F, df_F, np.nan, np.nan))

    names = [row[0] for row in rows]
    sss = [row[1] for row in rows]
    dfs = [row[2] for row in rows]
    fs = [row[3] for row in rows]
    pvals = [row[4] for row in rows]

    name_dtype = np.dtype(
        [('contrast', 'S%d' % max([len(n) for n in names]))])
    result = np.array(names, name_dtype)
    return ML.rec_append_fields(
        result,
        ['SS', 'df', 'MS', 'F', 'p_value'],
        [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
def typeI(response, ancova, recarray):
    """
    Produce an ANCOVA table from a given ANCOVA formula
    with type I (sequential) sums of squares, where the order is
    the order of terms in the contrast_names of ancova.

    The first term's sum of squares is np.sum(Y**2) minus the residual
    sum of squares of the model built from ancova.formulae[0].  Each
    later term's sum of squares is the reduction in residual sum of
    squares obtained by adding its formula to all earlier ones.  All F
    statistics share the residual mean square of the full model as
    denominator, and all p-values use the full model's residual degrees
    of freedom accordingly.

    Inputs
    ------

    response: str
              field name of response in recarray

    ancova: ANCOVA
            specifies the model to be fit

    recarray: np.ndarray
              should contain all field names in the terms of ancova
              as well as response

    Outputs
    -------

    result: record array with fields 'contrast', 'SS', 'df', 'MS', 'F'
            and 'p_value'; the final 'Residuals' row describes the last
            sequential model fitted
    """
    Y = recarray[response]
    n_obs = Y.shape[0]

    X = ancova.formula.design(recarray, return_float=True)
    results = OLS(Y, X).fit()
    SSE_F = np.sum(results.resid**2)
    df_F = results.df_resid
    # Denominator shared by every F statistic in the table.
    mse_full = SSE_F / df_F

    results = OLS(
        Y, ancova.formulae[0].design(recarray, return_float=True)).fit()
    SSE_old = np.sum(results.resid**2)
    df_old = results.df_resid

    names = []
    sss = []
    fs = []
    dfs = []
    pvals = []

    names.append(ancova.contrast_names[0])
    fs.append(((np.sum(Y**2) - SSE_old) / (n_obs - df_old)) / mse_full)
    sss.append(np.sum(Y**2) - SSE_old)
    dfs.append(n_obs - df_old)
    pvals.append(f_dbn.sf(fs[-1], n_obs - df_old, df_F))

    for d in range(1, len(ancova.formulae)):
        terms = []
        for f in ancova.formulae[:(d + 1)]:
            terms += list(f.terms)

        # JT: this is not numerically efficient
        # could be done by updating some factorization of the full X
        X = Formula(terms).design(recarray, return_float=True)
        results = OLS(Y, X).fit()
        SSE_new = np.sum(results.resid**2)
        df_new = results.df_resid

        sss.append(SSE_old - SSE_new)
        dfs.append(df_old - df_new)
        fs.append(((SSE_old - SSE_new) / (df_old - df_new)) / mse_full)
        # Denominator degrees of freedom must match the full-model mean
        # square used in the F statistic, hence df_F rather than df_new.
        pvals.append(f_dbn.sf(fs[-1], df_old - df_new, df_F))
        names.append(ancova.contrast_names[d])
        SSE_old = SSE_new
        df_old = df_new

    # Add in the "residual row".  After the loop SSE_old / df_old equal
    # the last sequential fit; they are also defined when ancova has a
    # single formula and the loop body never executes.
    sss.append(SSE_old)
    dfs.append(df_old)
    pvals.append(np.nan)
    fs.append(np.nan)
    names.append('Residuals')

    result = np.array(
        names,
        np.dtype([('contrast', 'S%d' % max([len(n) for n in names]))]))
    result = ML.rec_append_fields(
        result,
        ['SS', 'df', 'MS', 'F', 'p_value'],
        [sss, dfs, np.array(sss) / np.array(dfs), fs, pvals])
    return result