def test_statsmodel_to_df():
    amplitude = 1.432

    df_cha = pd.DataFrame()
    for n in range(5):
        raw = simulate_nirs_raw(sfreq=3., amplitude=amplitude, sig_dur=300.,
                                stim_dur=5., isi_min=15., isi_max=45.)
        design_matrix = make_first_level_design_matrix(raw, stim_dur=5.0)
        glm_est = run_GLM(raw, design_matrix)
        cha = glm_to_tidy(raw, glm_est, design_matrix)
        cha["ID"] = '%02d' % n
        df_cha = pd.concat([df_cha, cha], ignore_index=True)
    df_cha["theta"] = df_cha["theta"] * 1.0e6

    roi_model = smf.mixedlm("theta ~ -1 + Condition", df_cha,
                            groups=df_cha["ID"]).fit(method='nm')
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert df["Coef."]["Condition[A]"] == amplitude
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)

    roi_model = smf.rlm("theta ~ -1 + Condition", df_cha,
                        groups=df_cha["ID"]).fit()
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert df["Coef."]["Condition[A]"] == amplitude
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)
def linearRegression(segmentedValues):
    print("Linear regression")
    linRegress = dict()
    for key in segmentedValues.keys():
        x = [pt[0] for pt in segmentedValues[key]]
        y = [pt[1] for pt in segmentedValues[key]]
        mean = [float(np.average(x)), float(np.average(y))]
        valuesFrame = pd.DataFrame({'x': x, 'y': y})
        try:
            # Assumes statsmodels.formula.api is imported as sm
            rlmRes = sm.rlm(formula='y ~ x', data=valuesFrame).fit()
        except ZeroDivisionError:
            # No idea why this occurs; a problem inside statsmodels
            print("divide by zero :(")
            return None

        # Calculate the r2 score (unfortunately, RLM does not give this to us)
        x = np.array(x)
        y = np.array(y)
        # Get the predicted values of y
        y_pred = x * rlmRes.params.x + rlmRes.params.Intercept
        score = r2_score(y, y_pred)

        # Half-widths of the confidence intervals. These should both be
        # positive -- abs() is used just in case
        slopeConfInterval = abs(float(rlmRes.params.x)
                                - float(rlmRes.conf_int(.005)[0].x))
        intConfInterval = abs(float(rlmRes.params.Intercept)
                              - float(rlmRes.conf_int(.005)[0].Intercept))

        # Slope, intercept, r2, number of values, confidence intervals,
        # mean of cluster
        linRegress[key] = [rlmRes.params.x, rlmRes.params.Intercept, score,
                           len(x), [slopeConfInterval, intConfInterval], mean]
        print("Key: " + str(key) + " Slope: " + str(rlmRes.params.x)
              + " Intercept: " + str(rlmRes.params.Intercept)
              + " R2 Score: " + str(score) + " Num vals: " + str(len(x))
              + " confidence: " + str(slopeConfInterval) + ", "
              + str(intConfInterval) + " mean: " + str(mean))
    return linRegress
def report_rlm(formula, data, verbose=True, **kwargs):
    """Fit an RLM, optionally print a report, and return the fit results.

    Extra keyword arguments are forwarded to ``smf.rlm``.
    """
    results = smf.rlm(formula, data=data, **kwargs).fit()
    if verbose:
        print("\n{summary}\n".format(summary=results.summary()))
    return results
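# A minimal usage sketch for report_rlm (an assumption, not part of the
# original source): numpy, pandas, and statsmodels.api are imported under
# their usual aliases, and the synthetic data frame is invented here.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(0)
df_demo = pd.DataFrame({'x': rng.normal(size=50)})
df_demo['y'] = 2.0 * df_demo['x'] + rng.normal(scale=0.2, size=50)

# M is forwarded to smf.rlm through **kwargs
res = report_rlm('y ~ x', df_demo, M=sm.robust.norms.HuberT())
print(res.params)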
def rolling_ols(formula: str, data: pd.DataFrame, window: int, r2_adj=False,
                expanding=False, robust=False,
                M=sm.robust.norms.AndrewWave()):
    para_res = {}
    r_2_res = {}
    model_sig = {}
    forecast_res = pd.Series(dtype=float)
    for i in range(len(data) - window + 1):
        start_index = 0 if expanding else i
        tmp_df = data.iloc[start_index:i + window]
        forecast_x = data.iloc[i + window:i + window + 1]
        if robust:
            # Use the RLM weights to run a weighted least-squares fit
            rlm_model = smf.rlm(formula, data=tmp_df, M=M)
            ols_result = smf.wls(formula, data=tmp_df,
                                 weights=rlm_model.fit().weights).fit()
            # ols_result = sm.WLS(rlm_model.endog, rlm_model.exog,
            #                     weights=rlm_model.fit().weights).fit()
        else:
            ols_result = smf.ols(formula, data=tmp_df).fit()
        para_res[data.index[i + window - 1]] = ols_result.params
        model_sig[data.index[i + window - 1]] = ols_result.f_pvalue
        if r2_adj:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared_adj
        else:
            r_2_res[data.index[i + window - 1]] = ols_result.rsquared
        # One-step-ahead forecast
        forecast_res = pd.concat([forecast_res,
                                  ols_result.predict(forecast_x)])
    para_res = pd.DataFrame(para_res).T
    r_2_res = pd.Series(r_2_res)
    model_sig = pd.Series(model_sig)
    return para_res, r_2_res.mean(), model_sig, forecast_res
def view_Analysis(model_type: model_type, headers_dependent: headers_dependent,
                  headers_factor: headers_factor, headers_groups: headers_groups,
                  analysis_formula: analysis_formula):
    data = df

    # An explicit formula takes precedence over the drop-down selections
    mdl_string = 'noInput'
    if analysis_formula != '':
        mdl_string = analysis_formula
    elif headers_dependent != 'Select' and headers_factor != 'Select':
        mdl_string = headers_dependent + ' ~ ' + headers_factor

    if mdl_string != 'noInput':
        if model_type == 'Ordinary Least Squares':
            model = ols(mdl_string, data).fit()
        elif model_type == 'Generalized Linear Models':
            model = glm(mdl_string, data, family=sm.families.Gamma()).fit()
        elif model_type == 'Robust Linear Models':
            model = rlm(mdl_string, data, M=sm.robust.norms.HuberT()).fit()
        elif model_type == 'Linear Mixed Effects Models':
            # Note: 'model' stays unset if no grouping column was chosen
            if headers_groups != 'Select':
                model = mixedlm(mdl_string, data,
                                groups=data[headers_groups]).fit()
        elif model_type == 'Discrete - Regression with binary - Logit':
            model = Logit(data[headers_dependent],
                          data[headers_factor].astype(float)).fit()
        elif model_type == 'Discrete - Regression with binary - Probit':
            model = Probit(data[headers_dependent],
                           data[headers_factor].astype(float)).fit()
        elif model_type == 'Discrete - Regression with nominal - MNLogit':
            y = data[headers_factor]
            x = sm.add_constant(data[headers_dependent], prepend=False)
            model = sm.MNLogit(y, x).fit()
        elif model_type == 'Discrete - Regression with count - Poisson':
            model = Poisson(data[headers_dependent],
                            data[headers_factor].astype(float)).fit()
        display(model.summary())
def runModel(experiment, data, dependentVariable, independentVariables,
             regressionType='ols'):
    import statsmodels.formula.api as smf

    modelStr = modelString(experiment, dependentVariable, independentVariables)

    if regressionType == 'ols':
        model = smf.ols(modelStr, data=data)
    elif regressionType == 'gls':
        model = smf.gls(modelStr, data=data)
    elif regressionType == 'rlm':
        model = smf.rlm(modelStr, data=data)
    else:
        print('Unknown regression type {}. Exiting'.format(regressionType))
        import sys
        sys.exit()

    return model.fit()
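# Hypothetical usage of runModel. modelString() is not shown in this excerpt,
# so a stand-in that builds a plain additive patsy formula is defined here;
# the trial data and column names are likewise invented for illustration.
import pandas as pd

def modelString(experiment, dependentVariable, independentVariables):
    # Stand-in only: the real helper may add experiment-specific terms
    return dependentVariable + ' ~ ' + ' + '.join(independentVariables)

trials = pd.DataFrame({'rt': [350., 420., 390., 510., 400., 470.],
                       'condition': [0, 1, 0, 1, 0, 1]})
fit = runModel(None, trials, 'rt', ['condition'], regressionType='rlm')
print(fit.params)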
def rlm_formula(data, xseq, **params):
    """
    Fit RLM using a formula
    """
    eval_env = params['enviroment']
    formula = params['formula']
    init_kwargs, fit_kwargs = separate_method_kwargs(
        params['method_args'], sm.RLM, sm.RLM.fit)
    model = smf.rlm(formula, data, eval_env=eval_env, **init_kwargs)
    results = model.fit(**fit_kwargs)
    data = pd.DataFrame({'x': xseq})
    data['y'] = results.predict(data)

    if params['se']:
        warnings.warn(
            "Confidence intervals are not yet implemented "
            "for RLM smoothing.", PlotnineWarning)

    return data
def test_statsmodel_to_df(func):
    func = getattr(smf, func)
    np.random.seed(0)
    amplitude = 1.432

    df_cha = pd.DataFrame()
    for n in range(5):
        raw = simulate_nirs_raw(sfreq=3., amplitude=amplitude, sig_dur=300.,
                                stim_dur=5., isi_min=15., isi_max=45.)
        raw._data += np.random.normal(0, np.sqrt(1e-12), raw._data.shape)
        design_matrix = make_first_level_design_matrix(raw, stim_dur=5.0)
        glm_est = run_glm(raw, design_matrix)
        with pytest.warns(RuntimeWarning, match='Non standard source detect'):
            cha = glm_est.to_dataframe()
        cha["ID"] = '%02d' % n
        df_cha = pd.concat([df_cha, cha], ignore_index=True)
    df_cha["theta"] = df_cha["theta"] * 1.0e6

    roi_model = func("theta ~ -1 + Condition", df_cha,
                     groups=df_cha["ID"]).fit()
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert_allclose(df["Coef."]["Condition[A]"], amplitude, rtol=0.1)
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)

    roi_model = smf.rlm("theta ~ -1 + Condition", df_cha,
                        groups=df_cha["ID"]).fit()
    df = statsmodels_to_results(roi_model)
    assert type(df) == pd.DataFrame
    assert_allclose(df["Coef."]["Condition[A]"], amplitude, rtol=0.1)
    assert df["Significant"]["Condition[A]"]
    assert df.shape == (8, 8)
        if not output[group1][yr][outcome]:
            continue
        x_means.append(yr)
        for i in range(len(output[group1][yr][outcome])):
            x.append(yr)
            y.append(output[group2][yr][outcome][i]
                     - output[group1][yr][outcome][i])
        y_means.append(np.mean(y))
'''
'''
errorbar(output.index, yMeans, yerr=yErr, fmt='o-')
result = smf.ols(data=pd.DataFrame({'y': Ys, 'x': Xs}), formula='y~x').fit()
# result = smf.OLS(y_means, smt.add_constant(x_means)).fit()
plot(years, np.array(years) * result.params[1] + result.params[0], 'r--')
print('slope:' + str(result.params[1]) + ', ' + str(result.pvalues[1]))
'''
title(outcome)
xlabel('Year article was published')
# plot(x, y, 'x', x_means, y_means, 'o')
plot(output.index, yMeans, 'o', alpha=0.9)
plot(Xs, Ys, 'x', alpha=0.7)
result = smf.rlm(data=pd.DataFrame({'y': Ys, 'x': Xs}), formula='y~x').fit()
# result = smf.OLS(y_means, smt.add_constant(x_means)).fit()
plot(years, np.array(years) * result.params[1] + result.params[0], 'r--')
figtext(0.6, 0.8, 'slope:' + str(np.around(result.params[1], 4))
        + ', p=' + str(np.around(result.pvalues[1], 3)))
figtext(0.55, 0.75, 'green=raw data, blue=means')
print('intercept:' + str(result.params[0]) + ', ' + str(result.pvalues[0]))
show()
# raw_input('..')
def test_significance(df, dependent_var, *independent_vars, formula=None,
                      logit_model=False, correction_method='bonf',
                      anova_type=2):
    """
    Test the significance of independent vars on the dependent var and output
    the complete results of each step. This doesn't let us tune as many
    parameters as we might want to. (Don't use this generally)

    Args:
        df: DataFrame
        dependent_var: The name of the dependent variable column in df
        independent_vars: Array of independent variable columns in df
        formula (str): A formula relating the vars. If not specified, no
            interactions are assumed

    Returns:
        output (str): A string to print the results of each test
        results (dict): A dictionary of results corresponding to each test
    """
    ALPHA = 0.05  # Used for diagnostic tests

    output = ''
    results = {
        'multicollinearity': False,
        'homoskedastic': True,
        'normal_distribution': True,
    }

    # First add the summary data
    summary_df = rp.summary_cont(
        df.groupby(list(independent_vars))[dependent_var])
    summary_df['median'] = df.groupby(
        list(independent_vars))[dependent_var].median()
    output += f'Summary:\n{summary_df}\n\n'
    results['summary'] = summary_df

    # Get the OLS model formula
    if formula is None:
        formula = f"{dependent_var} ~ {' + '.join([f'C({v})' for v in independent_vars])}"

    # Then create the model and fit the data
    if not logit_model:
        model = smapi.ols(formula, data=df)
    else:
        # model = smapi.logit(formula, data=df)
        model = smapi.glm(formula, data=df, family=sm.families.Binomial())
    model_results = model.fit()
    output += f"{model_results.summary()}\n\n"
    results['initial'] = model_results

    # Check for normality
    if not logit_model:
        w, pvalue = spstats.shapiro(model_results.resid)
        output += f'Shapiro-Wilk test: {w, pvalue}\n\n'
        results['shapiro'] = (w, pvalue)
        # if pvalue < 1e-4:
        if pvalue < ALPHA:
            output += 'NON NORMAL detected. Do something else\n\n'
            results['normal_distribution'] = False

    # Check for homoskedasticity based on the normality test
    if not logit_model:
        unique_values = df.groupby(
            list(independent_vars)).size().reset_index().rename(
                columns={0: 'count'})
        hs_test_data = []
        for row in unique_values.itertuples(index=False):
            if len(independent_vars) > 1:
                selectors = [(df[v] == getattr(row, v))
                             for v in independent_vars]
                row_selector = np.logical_and(*selectors[:2])
                if len(independent_vars) > 2:
                    row_selector = np.logical_and(row_selector, selectors[2])
            else:
                v = independent_vars[0]
                row_selector = df[v] == getattr(row, v)
            hs_test_data.append(df.loc[row_selector, dependent_var])
        assert len(hs_test_data) == unique_values.shape[0]

        if results['normal_distribution']:
            w, pvalue = spstats.bartlett(*hs_test_data)
            output += f'Bartlett test: {w, pvalue}\n\n'
            results['bartlett'] = (w, pvalue)
        else:
            w, pvalue = spstats.levene(*hs_test_data)
            output += f'Levene test: {w, pvalue}\n\n'
            results['levene'] = (w, pvalue)

        if pvalue < ALPHA:
            output += 'HETEROSKEDASTICITY detected. Do something else\n\n'
            results['homoskedastic'] = False

    # Check that the condition number is reasonable
    if model_results.diagn['condno'] > 20:
        output += 'MULTICOLLINEARITY detected. Do something else\n\n'
        results['multicollinearity'] = True

    # If we are normal, non-multicollinear, and homoskedastic, perform ANOVA
    # and then multiple comparisons using Tukey's HSD. If heteroskedastic,
    # then we should use robust regression. Else, use a non-parametric test
    # TODO: Perhaps we should look into using the Wald test instead?
    # https://www.statsmodels.org/stable/generated/statsmodels.regression.linear_model.RegressionResults.wald_test.html
    if results['normal_distribution'] and results[
            'homoskedastic'] and not logit_model:
        o, r = test_using_anova(model, model_results, True, df, dependent_var,
                                *independent_vars, anova_type=anova_type)
        output += o
        results.update(r)
    elif results['normal_distribution'] and not logit_model:
        model = smapi.rlm(formula, data=df)
        rlm_results = model.fit()
        output += f"{rlm_results.summary()}\n\n"
        results['rlm'] = rlm_results
        o, r = test_using_anova(model, rlm_results, False, df, dependent_var,
                                *independent_vars, anova_type=anova_type)
        output += o
        results.update(r)
    elif not logit_model:
        o, r = test_using_kruskal(df, dependent_var, *independent_vars,
                                  correction_method=correction_method)
        output += o
        results.update(r)

    # Return the outputs
    return output, results
print(result_qr.conf_int(0.68))

# Covariance of fit parameters
pcov = result_qr.cov_params()
# Inverse standard deviation of fit parameters
pcor = np.diag(1 / np.sqrt(np.diag(pcov)))
print('\n*Covariance* of fit parameters')
print(result_qr.cov_params())
print('\n*Cross correlation* of fit parameters')
print(result_qr.cov_params(pcor))

print('***RLM REGRESSION***')
# Create regression model
model_rlm = smf.rlm('y ~ 1 + t + np.square(t)', data,
                    M=sm.robust.norms.TukeyBiweight())
# Do the regression fit
result_rlm = model_rlm.fit()

# Display data and best fit
plt.clf()
plt.plot(t, y, 'o', label='data')
plt.plot(t, model.predict(result.params), label='OLS')
plt.plot(t, model_qr.predict(result_qr.params), label='QuantReg')
plt.plot(t, model_rlm.predict(result_rlm.params), label='RLM')
plt.legend(loc='lower left')
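# The snippet above assumes t, y, data, and the fitted OLS (model, result)
# and quantile-regression (model_qr, result_qr) objects already exist. A
# minimal sketch of that setup with synthetic quadratic data plus a few
# outliers (the names follow the snippet; the data are invented here):
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf
import matplotlib.pyplot as plt

rng = np.random.default_rng(0)
t = np.linspace(0, 10, 100)
y = 0.5 + 1.2 * t - 0.08 * t ** 2 + rng.normal(0, 0.3, t.size)
y[::17] += 5.0  # inject outliers so the robust fits have work to do
data = pd.DataFrame({'t': t, 'y': y})

model = smf.ols('y ~ 1 + t + np.square(t)', data)
result = model.fit()
model_qr = smf.quantreg('y ~ 1 + t + np.square(t)', data)
result_qr = model_qr.fit(q=0.5)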
            x.append(j)
            y.append(diffy)
'''
ax = fig.add_subplot(8, 1, i)
# ax.plot(x, y, '.', alpha=0.99)
ax.plot(years, means, '.', alpha=0.99)
if i == 0:
    plt.xlabel('Years after last GSS wave used')
# ax.text(0.1, i, outcome, bbox={'facecolor': 'red', 'alpha': 0.5, 'pad': 10})
formula = outcome + '~years'
result = smf.ols(formula,
                 data=pd.DataFrame({'years': x, outcome: y}).dropna(axis=0),
                 missing='drop').fit()
results[outcome] = result
ax.plot(years, np.array(years) * result.params[1] + result.params[0], 'r--')
# fig.savefig('test.png', dpi=100)
'''
plot(years, means, '.', alpha=0.8)
xlim((-1, 43))
xlabel('Years after publication')
ylabel('Change in ' + outcomeMap[outcome])
title(outcomeMap[outcome] + ' Over Time')
formula = outcome + '~years'
result = smf.rlm(formula,
                 data=pd.DataFrame({'years': x, outcome: y}).dropna(axis=0),
                 missing='drop').fit()
plot(years, np.array(years) * result.params[1] + result.params[0], 'r--')
figtext(0.2, 0.3, 'slope:' + str(np.around(result.params[1], 4))
        + ', p=' + str(np.around(result.pvalues[1], 3)))
# figtext(0.6, 0.75, 'blue dot = model from an article')
print('intercept:' + str(result.params[0]) + ', ' + str(result.pvalues[0]))
show()
# ### Influence Plot

fig, ax = plt.subplots(figsize=(8, 6))
fig = sm.graphics.influence_plot(crime_model, ax=ax)

# ### Using robust regression to correct for outliers.

# Part of the problem here in recreating the Stata results is that
# M-estimators are not robust to leverage points. MM-estimators should do
# better with this example.

from statsmodels.formula.api import rlm

rob_crime_model = rlm("murder ~ urban + poverty + hs_grad + single",
                      data=dta,
                      M=sm.robust.norms.TukeyBiweight(3)).fit(conv="weights")
print(rob_crime_model.summary())

# rob_crime_model = rlm("murder ~ pctmetro + poverty + pcths + single",
#                       data=dta,
#                       M=sm.robust.norms.TukeyBiweight()).fit(conv="weights")
# print(rob_crime_model.summary())

# There isn't yet an influence diagnostics method as part of RLM, but we
# can recreate them. (This depends on the status of [issue
# #888](https://github.com/statsmodels/statsmodels/issues/808))

weights = rob_crime_model.weights
idx = weights > 0
X = rob_crime_model.model.exog[idx.values]
ww = weights[idx] / weights[idx].mean()
hat_matrix_diag = ww * (X * np.linalg.pinv(X).T).sum(1)
ax_21 = plt.Subplot(f, gs02[1, 1])
f.add_subplot(ax_21)
swarm_boxplot(ax_21, model_exp2, 'd', ' ', 2)
ax_21.set_ylabel('Bucket bias\n(Bayesian model)')

# ---------------------------------------------------------------------------------------
# 12. Plot robust linear regression of perseveration probability on bucket-shift parameter
# ---------------------------------------------------------------------------------------

# Data frame for regression model
data = pd.DataFrame()
data['pers'] = pers_noPush['pers'].copy()
data['d'] = model_exp2['d'].copy()

# Robust linear regression
mod = smf.rlm(formula='d ~ pers',
              M=sm.robust.norms.TukeyBiweight(3), data=data)
res = mod.fit(conv="weights")
print(res.summary())

# Plot results
ax_22 = plt.Subplot(f, gs02[1, 2])
f.add_subplot(ax_22)
x = pers_noPush['pers'].copy()
y = model_exp2['d'].copy()
ax_22.plot(x, y, '.', color='gray', alpha=0.7, markersize=2)
ax_22.plot(x, res.fittedvalues, '-', label="RLM", color="k")
ax_22.set_ylabel('Bucket bias\n(Bayesian model)')
ax_22.set_xlabel('Estimated\nperseveration probability')
ax_22.set_xticks(np.arange(0, 1, 0.2))

# --------------------------------------
def test_missing():
    # see GH#2083
    import statsmodels.formula.api as smf
    d = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan]}
    smf.rlm('Foo ~ Bar', data=d)
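# For context: the formula interface defaults to missing='drop', so the model
# above is built from the three complete rows rather than raising on the NaN.
# A small self-contained sketch of that behavior (variable names invented):
import numpy as np
import statsmodels.formula.api as smf

d_demo = {'Foo': [1, 2, 10, 149], 'Bar': [1, 2, 3, np.nan]}
mod_demo = smf.rlm('Foo ~ Bar', data=d_demo)
print(mod_demo.endog.shape)  # (3,) -- the row with the NaN was dropped
print(mod_demo.fit().params)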
            x.append(yr)
            y.append(output[group2][yr][outcome][i]
                     - output[group1][yr][outcome][i])
        y_means.append(np.mean(y))
'''
'''
errorbar(output.index, yMeans, yerr=yErr, fmt='o-')
result = smf.ols(data=pd.DataFrame({'y': Ys, 'x': Xs}), formula='y~x').fit()
# result = smf.OLS(y_means, smt.add_constant(x_means)).fit()
plot(years, np.array(years) * result.params[1] + result.params[0], 'r--')
print('slope:' + str(result.params[1]) + ', ' + str(result.pvalues[1]))
'''
title(outcome)
xlabel('Year article was published')
# plot(x, y, 'x', x_means, y_means, 'o')
plot(outputRandom.index, yMeansRandom, 'ro', alpha=0.9)
plot(outputActual.index, yMeansActual, 'go', alpha=0.9)
# plot(Xs, Ys, 'x', alpha=0.7)
resultRandom = smf.rlm(data=pd.DataFrame({'y': YsRandom,
                                          'x': np.array(XsRandom) - 1973}),
                       formula='y~x').fit()
resultActual = smf.rlm(data=pd.DataFrame({'y': YsActual,
                                          'x': np.array(XsActual) - 1973}),
                       formula='y~x').fit()
# result = smf.OLS(y_means, smt.add_constant(x_means)).fit()
plot(years, (np.array(years) - 1973) * resultRandom.params[1]
     + resultRandom.params[0], 'r--')
plot(years, (np.array(years) - 1973) * resultActual.params[1]
     + resultActual.params[0], 'g--')
figtext(0.45, 0.75, 'Red: random slope:'
        + str(np.around(resultRandom.params[1], 4))
        + ', p=' + str(np.around(resultRandom.pvalues[1], 2)))
figtext(0.45, 0.8, 'Green: actual slope:'
        + str(np.around(resultActual.params[1], 4))
        + ', p=' + str(np.around(resultActual.pvalues[1], 2)))
figtext(0.15, 0.75, 'random int.:'
        + str(np.around(resultRandom.params[0], 2))
        + ', p=' + str(np.around(resultRandom.pvalues[0], 2)))
figtext(0.15, 0.8, 'actual int.:'
        + str(np.around(resultActual.params[0], 2))
        + ', p=' + str(np.around(resultActual.pvalues[0], 2)))
show()
ols_model = ols('prestige ~ income + education', prestige).fit()
print(ols_model.summary())
print("######################")
print("Built in OLS")
print("######################")

# now get the robust estimate using huber
params, resids, squareResid, rank, s = olsModel(predictor, obs, intercept=True)
print(params)
print("######################")
print("Checking M-estimate")
print("######################")

rlm_model = rlm('prestige ~ income + education', prestige,
                M=sm.robust.norms.HuberT(t=1.345)).fit()
print(rlm_model.summary())
print("######################")
print("Built in M-estimate")
print("######################")

params, resids, scale, weights = mestimateModel(predictor, obs,
                                                weights="huber",
                                                intercept=True)
print(params)
print("######################")
print("Checking MAD")
print("######################")

np.random.seed(12345)
def agesex_adjust(self, df, sig_level=0.01):
    # Grab the participant data
    prt_data = self.participants

    for column in df.columns:
        # Merge the column with participant data
        sub = df.loc[:, [column]].merge(prt_data, left_index=True,
                                        right_on='username').dropna()
        sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']
        if sub.shape[0] < 20:
            continue
        model = smf.rlm(formula='value~age', data=sub,
                        M=statsmodels.robust.norms.TrimmedMean())
        res = model.fit()
        if res.pvalues['age'] < sig_level:
            # print "corrected age", column, res.pvalues['age']
            temp = pandas.DataFrame(res.resid, columns=[column])
            ## Do partial regression
            # temp = pandas.DataFrame(smf.ols(formula='value~age', data=sub).fit().resid, columns=[column])
            # Set the index
            temp.index = sub['username']
            # Update the original dataframe
            df.update(temp)

        # Merge the column with participant data
        sub = df.loc[:, [column]].merge(prt_data, left_index=True,
                                        right_on='username').dropna()
        sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']
        model = smf.rlm(formula='value~C(gender)', data=sub,
                        M=statsmodels.robust.norms.TrimmedMean())
        res = model.fit()
        if res.pvalues['C(gender)[T.M]'] < sig_level:
            temp = pandas.DataFrame(res.resid, columns=[column])
            ## Do partial regression
            # temp = pandas.DataFrame(smf.ols(formula='value~C(gender)', data=sub).fit().resid, columns=[column])
            # Set the index
            temp.index = sub['username']
            # Update the original dataframe
            df.update(temp)

    return df
# Data frame for regression model
data = pd.DataFrame()
data['pers'] = pers_noPush['pers'].copy()
data['d'] = model_exp2['d'].copy()
data['age_group'] = pers_noPush['age_group'].copy()

# Recode the age dummy variable in reverse order, i.e., with older adults as
# the reference group, because they seem to have the strongest effect
data.loc[data['age_group'] == 3, 'age_group'] = 2  # YA in the middle
data.loc[data['age_group'] == 1, 'age_group'] = 3  # CH last variable
data.loc[data['age_group'] == 4, 'age_group'] = 1  # OA reference

# Robust linear regression
mod = smf.rlm(
    formula='d ~ pers + C(age_group, Treatment) + pers * C(age_group, Treatment)',
    M=sm.robust.norms.TukeyBiweight(3), data=data)
res = mod.fit(conv="weights")
print(res.summary())

# Plot results
plt.plot(pers_noPush[pers_noPush['age_group'] == 1]['pers'].copy(),
         model_exp2[pers_noPush['age_group'] == 1]['d'].copy(),
         '.', color=colors[0], alpha=1, markersize=5)
plt.plot(pers_noPush[pers_noPush['age_group'] == 3]['pers'].copy(),
         model_exp2[pers_noPush['age_group'] == 3]['d'].copy(),
         '.',
def delta_transform_agesex_adjust(self, sig_level=0.01):
    if self.type in ['GENOM', 'COACH']:
        return self.GetDataFrame()

    df = self.GetDataFrame()
    # Grab the participant data
    prt_data = self.participants

    # Build prt_data dataframe for multiple rounds
    prt_data1 = prt_data.copy()
    prt_data2 = prt_data.copy()
    prt_data3 = prt_data.copy()
    prt_data1['username'] = [x + '_1' for x in prt_data1['username'].tolist()]
    prt_data2['username'] = [x + '_2' for x in prt_data2['username'].tolist()]
    prt_data3['username'] = [x + '_3' for x in prt_data3['username'].tolist()]
    prt_data = pandas.concat([prt_data1, prt_data2, prt_data3], axis=0)

    for column in df.columns:
        # Merge the column with participant data
        sub = df.loc[:, [column]].merge(prt_data, left_index=True,
                                        right_on='username').dropna()
        sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']
        if sub.shape[0] < 20:
            continue
        model = smf.rlm(formula='value~age', data=sub,
                        M=statsmodels.robust.norms.TrimmedMean())
        res = model.fit()
        if res.pvalues['age'] < sig_level:
            # print "corrected age", column, res.pvalues['age']
            temp = pandas.DataFrame(res.resid, columns=[column])
            ## Do partial regression
            # temp = pandas.DataFrame(smf.ols(formula='value~age', data=sub).fit().resid, columns=[column])
            # Set the index
            temp.index = sub['username']
            # Update the original dataframe
            df.update(temp)

        # Merge the column with participant data
        sub = df.loc[:, [column]].merge(prt_data, left_index=True,
                                        right_on='username').dropna()
        sub.columns = ['value', 'username', 'gender', 'age', 'ancestry']
        model = smf.rlm(formula='value~C(gender)', data=sub,
                        M=statsmodels.robust.norms.TrimmedMean())
        res = model.fit()
        if res.pvalues['C(gender)[T.M]'] < sig_level:
            temp = pandas.DataFrame(res.resid, columns=[column])
            ## Do partial regression
            # temp = pandas.DataFrame(smf.ols(formula='value~C(gender)', data=sub).fit().resid, columns=[column])
            # Set the index
            temp.index = sub['username']
            # Update the original dataframe
            df.update(temp)

    # Now split by round and calculate the delta values
    df['round'] = [int(x[-1]) for x in df.index.tolist()]
    df['username'] = [x.split('_')[0] for x in df.index.tolist()]
    r1 = df[df['round'] == 1].set_index('username').drop('round', axis=1)
    r2 = df[df['round'] == 2].set_index('username').drop('round', axis=1)
    r3 = df[df['round'] == 3].set_index('username').drop('round', axis=1)
    r1_r2 = r2 - r1
    r2_r3 = r3 - r2
    r1_r2.index = ["%s_1" % x for x in r1_r2.index.tolist()]
    r2_r3.index = ["%s_2" % x for x in r2_r3.index.tolist()]
    joined = pandas.concat([r1_r2, r2_r3], axis=0)

    return self._apply_restrictions(joined)
def fit(model, data, model_type='ols', sample_rate=.8, figsize=(10, 10),
        fontsize=12):
    """
    Linear regression model with visualization of fitting parameters

    :param model: patsy model specification
    :param data: pandas dataframe of results
    :param model_type: ols or rlm (ordinary least squares or robust linear model)
    :param sample_rate: float (range of 0 to 1). Partitions training and testing set
    :param figsize: figure size of output
    :param fontsize: font size for regression output
    :return: tuple of axes.
    """
    full = data.copy()
    mask = np.random.uniform(low=0, high=1, size=len(full))
    if sample_rate < 1:
        train = data[mask <= sample_rate]
        test = data[mask > sample_rate]
    else:
        train = full

    y, x = patsy.dmatrices(model, train)

    # We never want to plot the intercept, so need to make space if it is missing
    has_int = 0
    for name in x.design_info.column_names:
        if name == 'Intercept':
            has_int = 1

    if model_type.lower() == 'ols':
        model = smf.ols(model, data=train)
    elif model_type.lower() == 'rlm':
        model = smf.rlm(model, data=train)
    setattr(model.data.orig_exog, 'design_info', x.design_info)  # some bug in patsy makes me do this...
    results = model.fit()

    # Get predicted values from the confirmation data set.
    if sample_rate < 1:
        yfit = results.predict(test)
        # bug in statsmodels; should be fixed in 0.7.
        if yfit.shape != test[model.endog_names].shape:
            sample_rate = 1

    summ = results.summary2()
    var = model.endog_names  # y variable name

    # Track all categorical variables because they are separated in the design
    # matrix. We need to adjust the plotting later on to lump all the
    # categoricals into one.
    categoricals = [c for c in x.design_info.column_names if c.startswith('C(')]
    cat_plots = set([re.split('[()]', c)[1] for c in categoricals])

    # Generate linear regression line for the actual vs predicted plot.
    # slope, intercept, r_value, p_value, std_err = sps.linregress(y[:, 0], results.fittedvalues)
    xs = np.linspace(np.min(y[:, 0]), np.max(y[:, 0]), 100)

    # Determine how many columns we need on the plot
    r, c = np.shape(x)
    if len(categoricals) > 0:
        c = c - len(categoricals) + len(cat_plots) - has_int
    # make a minimum of 3 columns, even if there are fewer factors
    if c < 3:
        c = 3

    fig = plt.figure(tight_layout=True, figsize=figsize)
    grid = gs.GridSpec(3, c)

    # model axis
    axm = fig.add_subplot(grid[0, 0:c - 1])
    axm.scatter(y[:, 0], results.fittedvalues, label='Train')
    axm.plot(xs, xs, 'k--')

    # plot the sampled data along with the fitted data
    if sample_rate < 1:
        axm.scatter(test[var], yfit, label='Test', color='red')
    axm.set_title('Actual vs Predicted Plot')
    axm.set_ylabel('{} Predicted'.format(var))
    axm.set_xlabel(var)
    plt.setp(axm.xaxis.get_majorticklabels(), rotation=90)
    try:
        fig.text(.1, .9, '$R^2$ = {}\nRMSE = {}'.format(
            round(results.rsquared, 2), round(results.mse_total ** .5, 4)))
    except AttributeError:
        pass
    axm.legend(scatterpoints=1, loc=4)

    # histogram axis
    axh = fig.add_subplot(grid[0, c - 1])
    axh.hist(list(results.resid))
    axh.set_title('Model Residuals')
    plt.setp(axh.xaxis.get_majorticklabels(), rotation=90)

    # text axis
    axt = fig.add_subplot(grid[2, :])
    axt.text(0, 1, summ.as_text(),
             horizontalalignment='left',
             verticalalignment='top',
             family='Courier New',
             fontsize=fontsize,
             weight='semibold')
    axt.axis('off')

    # plot scatter plots in factor row
    numcats = 0
    column = 1
    ski = 0
    for i, factor in enumerate(x.T):
        if x.design_info.column_names[i] == 'Intercept':
            # skip the intercept
            ski = 1
            continue
        if x.design_info.column_names[i].startswith('C('):
            numcats += 1
            continue
        column = i - numcats - ski

        # draw scatter plot
        ax = fig.add_subplot(grid[1, column])
        ax.scatter(factor, y[:, 0])
        ax.set_xlabel(x.design_info.column_names[i])
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

        # draw dotted linear regression line on each factor plot
        slope, intercept, r_value, p_value, std_err = sps.linregress(factor, y[:, 0])
        xs = np.linspace(np.min(factor), np.max(factor), 100)
        ys = xs * slope + intercept
        ax.plot(xs, ys, 'r--')
        ax.set_ylabel(var)

    # plot categorical plots in factor row
    for i, factor in enumerate(cat_plots, start=column + 1):
        ax = fig.add_subplot(grid[1, i])
        ax = full.boxplot(column=var, by=factor, ax=ax, showfliers=False)
        ax.set_title('')
        plt.setp(ax.xaxis.get_majorticklabels(), rotation=90)

    fig.suptitle('')
    return fig, (fig.get_axes()), results
student = infl.summary_frame()['student_resid']
print(student)
print(student.loc[np.abs(student) > 2])
print(infl.summary_frame().loc['minister'])

sidak = ols_model.outlier_test('sidak')
sidak.sort_values('unadj_p', inplace=True)
print(sidak)

fdr = ols_model.outlier_test('fdr_bh')
fdr.sort_values('unadj_p', inplace=True)
print(fdr)

rlm_model = rlm('prestige ~ income + education', prestige).fit()
print(rlm_model.summary())
print(rlm_model.weights)

# ### Hertzsprung Russell data for Star Cluster CYG 0B1 - Leverage Points

# * Data is on the luminosity and temperature of 47 stars in the direction
# of Cygnus.

dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data

from matplotlib.patches import Ellipse
fig = plt.figure(figsize=(12, 8))
ax = fig.add_subplot(
    111,
def fit(self, df, formula):
    return smf.rlm(formula=formula, data=df).fit()
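# A minimal usage sketch for this thin wrapper (the class name and data are
# invented; assumes numpy, pandas, and statsmodels.formula.api as smf):
import numpy as np
import pandas as pd
import statsmodels.formula.api as smf

class RobustFitter:
    def fit(self, df, formula):
        return smf.rlm(formula=formula, data=df).fit()

rng = np.random.default_rng(1)
df_demo = pd.DataFrame({'x': rng.normal(size=100)})
df_demo['y'] = 3.0 * df_demo['x'] + rng.normal(scale=0.5, size=100)
res = RobustFitter().fit(df_demo, 'y ~ x')
print(res.params)  # slope should land near 3.0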
plt.figure(figsize=(5, 2))
seaborn.boxplot(data.nb_passengers_2001 - data.nb_passengers_2000)
plt.title('NB passengers: 2001 - 2000')
plt.subplots_adjust()

##############################################################################
# Statistical testing: dependence of fare on distance and number of
# passengers
import statsmodels.formula.api as sm

result = sm.ols(formula='fare ~ 1 + dist + nb_passengers',
                data=data_flat).fit()
print(result.summary())

# Using a robust fit
result = sm.rlm(formula='fare ~ 1 + dist + nb_passengers',
                data=data_flat).fit()
print(result.summary())

##############################################################################
# Statistical testing: regression of fare on distance: 2001/2000 difference
result = sm.ols(formula='fare_2001 - fare_2000 ~ 1 + dist', data=data).fit()
print(result.summary())

# Plot the corresponding regression
data['fare_difference'] = data['fare_2001'] - data['fare_2000']
seaborn.lmplot(x='dist', y='fare_difference', data=data)

plt.show()
regress_b['Fit'] = data_b[:, 2]

fig = plt.figure(1, figsize=(12, 4))
fig.suptitle('')
ax1, ax2 = fig.subplots(1, 2)
ax1.plot(regress_a['Temperatur'], regress_a['Spannung'], 'ro')
ax1.plot(regress_a['Temperatur'], regress_a['Fit'], 'r--')
ax1.plot(regress_b['Temperatur'], regress_b['Spannung'], 'bo')
ax1.plot(regress_b['Temperatur'], regress_b['Fit'], 'b--')
ax1.axis([-10, 110, 2, 5])
ax1.set_xlabel('Temperature $T$ / °C')
ax1.set_ylabel('Voltage $U$ / V')
ax1.set_title('Robust regression')
ax1.grid(True)

""" Define and compute the robust linear regression """
model = rlm("Spannung ~ Temperatur", regress_a,
            M=sm.robust.norms.AndrewWave()).fit()
print(model.summary())
regress_a['Fit_robust'] = model.fittedvalues
regress_a['Gewichtungsfaktor'] = model.weights

""" Plot the robust regression function and the weights """
ax1.plot(regress_a['Temperatur'], regress_a['Fit_robust'], 'g')
ax2.bar(regress_a['Temperatur'], regress_a['Gewichtungsfaktor'], 5, color='b')
ax2.set_xlabel('Temperature $T$ / °C')
ax2.grid(True)
ax2.set_ylabel('Weighting factor')
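# The RLM weights drop toward zero for points the AndrewWave norm treats as
# outliers, so they double as an outlier flag. A follow-up sketch (the 0.5
# threshold is an arbitrary choice, not from the original script):
suspect = regress_a[regress_a['Gewichtungsfaktor'] < 0.5]
print(suspect[['Temperatur', 'Spannung', 'Gewichtungsfaktor']])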
import statsmodels.formula.api as smf
import statsmodels.api as sm
import pandas
import seaborn.apionly as sns

# iris = sns.load_dataset('iris')
# print(iris.head())
# Fit model and print summary
# rlm_model = smf.rlm(formula='sepal_length ~ sepal_width + petal_length + petal_width', data=iris, M=None)

data = sm.datasets.get_rdataset('epil', package='MASS').data

# Leftovers from a GEE example; unused by the RLM fit below
fam = sm.families.Poisson()
ind = sm.cov_struct.Exchangeable()

mod = smf.rlm("y ~ age + trt + base", data=data)
r = mod.fit()
print(r.summary())
student = infl.summary_frame()["student_resid"] print(student) print(student.loc[np.abs(student) > 2]) print(infl.summary_frame().loc["minister"]) sidak = ols_model.outlier_test("sidak") sidak.sort_values("unadj_p", inplace=True) print(sidak) fdr = ols_model.outlier_test("fdr_bh") fdr.sort_values("unadj_p", inplace=True) print(fdr) rlm_model = rlm("prestige ~ income + education", prestige).fit() print(rlm_model.summary()) print(rlm_model.weights) # ### Hertzprung Russell data for Star Cluster CYG 0B1 - Leverage Points # * Data is on the luminosity and temperature of 47 stars in the direction # of Cygnus. dta = sm.datasets.get_rdataset("starsCYG", "robustbase", cache=True).data from matplotlib.patches import Ellipse fig = plt.figure(figsize=(12, 8)) ax = fig.add_subplot(
def smafit(X0, Y0, W0=None, cl=0.95, intercept=True, robust=False,
           rmethod='FastMCD'):
    """Standard Major-Axis (SMA) line fitting

    Calculate the standard major axis, aka reduced major axis, fit to data X
    and Y. The main advantage of this over ordinary least squares is that the
    best fit of Y to X will be the same as the best fit of X to Y.

    The fit equations and confidence intervals are implemented following
    Warton et al. (2006). Robust fits use the FastMCD covariance estimate
    from Rousseeuw and Van Driessen (1999). While there are many alternative
    robust covariance estimators (e.g. other papers by D.I. Warton using
    M-estimators), the FastMCD algorithm is the default in Matlab.

    When the standard error or uncertainty of each point is known, then
    weighted SMA may be preferable to robust SMA. The conventional choice of
    weights for each point i is W_i = 1 / ( var(X_i) + var(Y_i) ), where
    var() is the variance (squared standard error).

    References
    Warton, D. I., Wright, I. J., Falster, D. S. and Westoby, M.:
        Bivariate line-fitting methods for allometry, Biol. Rev., 81(02),
        259, doi:10.1017/S1464793106007007, 2006.
    Rousseeuw, P. J. and Van Driessen, K.: A Fast Algorithm for the Minimum
        Covariance Determinant Estimator, Technometrics, 41(3), 1999.

    Parameters
    ----------
    X0, Y0 : array_like
        Input values. Must have the same length.
    W0 : optional
        array of weights for each X-Y point,
        typically W_i = 1/(var(X_i)+var(Y_i))
    cl : float (default = 0.95)
        Desired confidence level for output.
    intercept : boolean (default=True)
        Specify if the fitted model should include a non-zero intercept.
        The model will be forced through the origin (0,0) if intercept=False.
    robust : boolean (default=False)
        Use statistical methods that are robust to the presence of outliers
    rmethod : string (default='FastMCD')
        Method for calculating robust variance and covariance. Options:
        'MCD' or 'FastMCD' for Fast MCD
        'Huber' for Huber's T: reduce, not eliminate, influence of outliers
        'Biweight' for Tukey's Biweight: reduces then eliminates influence
        of outliers

    Returns
    -------
    Slope : float
        Slope or gradient of Y vs. X
    Intercept : float
        Y intercept.
    ste_slope : float
        Standard error of the slope estimate
    ste_int : float
        Standard error of the intercept estimate
    ci_grad : [float, float]
        Confidence interval for the slope at confidence level cl
    ci_int : [float, float]
        Confidence interval for the intercept at confidence level cl
    """
    import numpy as np
    import scipy.stats as stats
    from sklearn.covariance import MinCovDet
    import statsmodels.formula.api as smf
    import statsmodels.robust.norms as norms

    # Make sure arrays have the same length
    assert len(X0) == len(Y0), 'Arrays X and Y must have the same length'
    if W0 is not None:
        assert len(W0) == len(X0), 'Array W must have the same length as X and Y'

    # Make sure cl is within the range 0-1
    assert cl < 1, 'cl must be less than 1'
    assert cl > 0, 'cl must be greater than 0'

    if W0 is None:
        W0 = np.zeros_like(X0) + 1

    # Drop any NaN elements of X or Y.
    # Infinite values are allowed but will make the result undefined
    idx = ~np.logical_or(np.isnan(X0), np.isnan(Y0))
    X = X0[idx]
    Y = Y0[idx]
    W = W0[idx]

    # Number of observations
    N = len(X)

    # Degrees of freedom for the model
    if intercept:
        dfmod = 2
    else:
        dfmod = 1

    # Choose whether to use methods robust to outliers
    if robust:
        # Choose the robust method
        if rmethod.lower() in ('mcd', 'fastmcd'):
            # FAST MCD
            if not intercept:
                # intercept=False could possibly be supported by using
                # mcd.support_ as weights in an explicit variance/covariance
                # calculation
                raise NotImplementedError(
                    'FastMCD method only supports SMA with intercept')

            # Fit robust model of mean and covariance
            mcd = MinCovDet().fit(np.array([X, Y]).T)

            # Robust mean
            Xmean = mcd.location_[0]
            Ymean = mcd.location_[1]

            # Robust variance of X, Y
            Vx = mcd.covariance_[0, 0]
            Vy = mcd.covariance_[1, 1]

            # Robust covariance
            Vxy = mcd.covariance_[0, 1]

            # Number of observations used in the mean and covariance
            # estimate; excludes observations marked as outliers
            N = mcd.support_.sum()

        elif rmethod.lower() in ('biweight', 'huber'):
            # Tukey's Biweight and Huber's T
            if rmethod.lower() == 'biweight':
                norm = norms.TukeyBiweight()
            else:
                norm = norms.HuberT()

            # Get weights for downweighting outliers.
            # Fitting a linear model is the easiest way to get these.
            # Options include "TukeyBiweight" (totally removes large deviates)
            # and "HuberT" (linear, not squared, weighting of large deviates)
            rweights = smf.rlm('y~x+1', {'x': X, 'y': Y}, M=norm).fit().weights

            # Sum of weights and weights squared, for convenience
            rsum = np.sum(rweights)
            rsum2 = np.sum(rweights ** 2)

            # Mean
            Xmean = np.sum(X * rweights) / rsum
            Ymean = np.sum(Y * rweights) / rsum

            # Force intercept through zero, if requested
            if not intercept:
                Xmean = 0
                Ymean = 0

            # Variance & covariance
            Vx = np.sum((X - Xmean) ** 2 * rweights ** 2) / rsum2
            Vy = np.sum((Y - Ymean) ** 2 * rweights ** 2) / rsum2
            Vxy = np.sum((X - Xmean) * (Y - Ymean) * rweights ** 2) / rsum2

            # Effective number of observations
            N = rsum

        else:
            raise NotImplementedError(
                "smafit.py hasn't implemented rmethod={}".format(rmethod))

    else:
        if intercept:
            wsum = np.sum(W)

            # Average values
            Xmean = np.sum(X * W) / wsum
            Ymean = np.sum(Y * W) / wsum

            # Covariance matrix
            cov = np.cov(X, Y, ddof=1, aweights=W ** 2)

            # Variance
            Vx = cov[0, 0]
            Vy = cov[1, 1]

            # Covariance
            Vxy = cov[0, 1]
        else:
            # Force the line to pass through the origin by setting the means
            # to zero
            Xmean = 0
            Ymean = 0

            wsum = np.sum(W)

            # Sums of squares in place of variance and covariance
            Vx = np.sum(X ** 2 * W) / wsum
            Vy = np.sum(Y ** 2 * W) / wsum
            Vxy = np.sum(X * Y * W) / wsum

    # Standard deviations
    Sx = np.sqrt(Vx)
    Sy = np.sqrt(Vy)

    # Correlation coefficient (equivalent to np.corrcoef()[1, 0] for
    # non-robust cases)
    R = Vxy / np.sqrt(Vx * Vy)

    #############
    # SLOPE
    Slope = np.sign(R) * Sy / Sx

    # Standard error of the slope estimate
    ste_slope = np.sqrt(1 / (N - dfmod) * Sy ** 2 / Sx ** 2 * (1 - R ** 2))

    # Confidence interval for the slope
    B = (1 - R ** 2) / (N - dfmod) * stats.f.isf(1 - cl, 1, N - dfmod)
    ci_grad = Slope * (np.sqrt(B + 1) + np.sqrt(B) * np.array([-1, +1]))

    #############
    # INTERCEPT
    if intercept:
        Intercept = Ymean - Slope * Xmean

        # Standard deviation of residuals.
        # New method: formula from the smatr R package (Warton).
        # This formula avoids large residuals of outliers when using
        # robust=True
        Sr = np.sqrt((Vy - 2 * Slope * Vxy + Slope ** 2 * Vx)
                     * (N - 1) / (N - dfmod))

        # OLD METHOD
        # Standard deviation of residuals
        # resid = Y - (Intercept + Slope * X)
        # Population standard deviation of the residuals
        # Sr = np.std(resid, ddof=0)

        # Standard error of the intercept estimate
        ste_int = np.sqrt(Sr ** 2 / N + Xmean ** 2 * ste_slope ** 2)

        # Confidence interval for the intercept
        tcrit = stats.t.isf((1 - cl) / 2, N - dfmod)
        ci_int = Intercept + ste_int * np.array([-tcrit, tcrit])
    else:
        # Set intercept quantities to zero
        Intercept = 0
        ste_int = 0
        ci_int = np.array([0, 0])

    return Slope, Intercept, ste_slope, ste_int, ci_grad, ci_int
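# A minimal usage sketch for smafit with synthetic data (invented here; the
# robust 'Biweight' branch exercises the smf.rlm weighting path above):
import numpy as np

rng = np.random.default_rng(42)
x_demo = rng.normal(10, 2, 200)
y_demo = 1.5 * x_demo + rng.normal(0, 1, 200)
y_demo[:5] += 20  # a few gross outliers

slope, intercept, ste_slope, ste_int, ci_grad, ci_int = smafit(
    x_demo, y_demo, robust=True, rmethod='Biweight')
print(slope, ci_grad)  # slope should be near 1.5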