def model_formulas():
    """Define models through formulas.

    Reads ``swim100m.csv`` (development of world record times for the 100m
    freestyle, for men and women), fits three OLS models of increasing
    complexity, and prints each model summary followed by its ANOVA table.

    Returns:
        The first entry of the ``F`` column of the ANOVA table of the
        interaction model (should be 156.1407931415788).
    """
    # Get the data
    data = pd.read_csv('swim100m.csv')

    # Different models
    model1 = ols("time ~ sex", data).fit()          # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction

    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())

    # ANOVAs
    print('----------------- Results ANOVAs: Model 1 -----------------------')
    print(anova_lm(model1))

    print('--------------------- Model 2 -----------------------------------')
    print(anova_lm(model2))

    print('--------------------- Model 3 -----------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)

    # Just to check the correct run. The ANOVA table is indexed by term name,
    # so the first row has to be taken by position (.iloc), not with an
    # integer key.
    return model3Results['F'].iloc[0]   # should be 156.1407931415788
def anova_oneway():
    """One-way ANOVA: test if results from 3 groups are equal.

    Loads ``altman_910.txt`` through ``getData``, splits column 0 into three
    groups according to the group code in column 1, runs a Levene test and a
    one-way ANOVA with ``scipy.stats``, and finally prints the ANOVA table of
    the equivalent statsmodels OLS model.
    """
    # Get the data
    data = getData('altman_910.txt')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results. Function-call syntax is used throughout so the block
    # also parses on Python 3; the tuple is wrapped explicitly so the output
    # stays "(F, p)".
    print('Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    print(anova_lm(model))
def anova_interaction(data_lastDV):
    """Two-way ANOVA and interaction analysis of given data.

    http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html

    Note: 2way ANOVAs are for 2+ categorical independent/causal variables,
    with 2+ levels each.

    :param data_lastDV: data frame containing the independent variables in the
        first two columns and the dependent variable in the third
    :return: None
    """
    col_names = data_lastDV.columns.values  # get the columns' names

    # Validate before doing any work; the count must be converted to str
    # before concatenation, and the analysis cannot continue without a
    # third column.
    if len(col_names) < 3:
        print("ERROR in statsMOOC.py: Not enough columns in dataframe to do interaction analysis: "
              + str(len(col_names)))
        return

    factor_groups = data_lastDV[col_names].dropna()
    first, second, outcome = col_names[0], col_names[1], col_names[2]

    # two-way anova: additive model and the same model with interaction
    formula = outcome + " ~ C(" + first + ") + C(" + second + ")"
    formula_interaction = outcome + " ~ C(" + first + ") * C(" + second + ")"
    additive_lm = ols(formula, data=factor_groups).fit()  # linear model
    print(additive_lm.summary())

    print(FORMAT_LINE)
    print("- " + outcome + " = " + first + " * " + second + " Interaction -")
    # Nested models are passed from the restricted to the fuller one, the
    # same order the other comparisons in this function use.
    print(anova_lm(additive_lm,
                   ols(formula_interaction, data=factor_groups).fit()))

    both_factors = outcome + " ~ C(" + first + ") + C(" + second + ", Sum)"

    print(FORMAT_LINE)
    print("- " + outcome + " = " + first + " + " + second + " ANOVA -")
    print(anova_lm(ols(outcome + " ~ C(" + first + ")", data=factor_groups).fit(),
                   ols(both_factors, data=factor_groups).fit()))

    print(FORMAT_LINE)
    print("- " + outcome + " = " + second + " + " + first + " ANOVA -")
    print(anova_lm(ols(outcome + " ~ C(" + second + ")", data=factor_groups).fit(),
                   ols(both_factors, data=factor_groups).fit()))
def model_formulas():
    """Define models through formulas.

    Reads ``swim100m.csv`` from the ``data_kaplan`` directory, fits three OLS
    models of increasing complexity, and prints each model summary followed
    by its ANOVA table.

    Returns:
        The first entry of the ``F`` column of the ANOVA table of the
        interaction model (should be 156.1407931415788).
    """
    # Get the data
    data = read_csv(r'..\Data\data_kaplan\swim100m.csv')

    # Different models
    model1 = ols("time ~ sex", data).fit()          # one factor
    model2 = ols("time ~ sex + year", data).fit()   # two factors
    model3 = ols("time ~ sex * year", data).fit()   # two factors with interaction

    # Model information
    print(model1.summary())
    print(model2.summary())
    print(model3.summary())

    # ANOVAs
    print('-----------------------------------------------------------------')
    print(anova_lm(model1))

    print('-----------------------------------------------------------------')
    print(anova_lm(model2))

    print('-----------------------------------------------------------------')
    model3Results = anova_lm(model3)
    print(model3Results)

    # Just to check the correct run. The ANOVA table is indexed by term name,
    # so the first row is selected by position rather than with an integer key.
    return model3Results['F'].iloc[0]   # should be 156.1407931415788
def run_anova(self):
    """Run the two-way ANOVAs on ``prob_diff`` and publish F and p values.

    The table is first restricted to the areas in ``self.params.anova_areas``.
    One model (Area x Pulse_Frequency) is fitted on that table; two more
    (Area x Duration, Area x Amplitude) are fitted on each of the low
    (10, 25) and high (100, 200) pulse-frequency subsets. For every model the
    first three entries of the ``F`` and ``PR(>F)`` columns are handed to
    ``self.pass_object`` under ``fvalue_<suffix>`` / ``pvalue_<suffix>``.
    """
    def publish(formula, table, suffix):
        # Fit one OLS model and pass on the first three F / p values.
        # (A mixedlm variant grouped by Subject was left commented out in the
        # earlier version of this method.)
        anova = anova_lm(ols(formula, data=table).fit())
        self.pass_object('fvalue_' + suffix, anova['F'].values[0:3])
        self.pass_object('pvalue_' + suffix, anova['PR(>F)'].values[0:3])

    ps_table_for_anova = self.ps_table[self.ps_table['Area'].isin(self.params.anova_areas)]
    publish('prob_diff ~ C(Area) * C(Pulse_Frequency)', ps_table_for_anova, 'rf')

    for label, frequencies in (('low', [10, 25]), ('high', [100, 200])):
        subset = ps_table_for_anova[ps_table_for_anova['Pulse_Frequency'].isin(frequencies)]
        # Single pre-formatted argument: prints the same text whether print
        # is a statement or a function.
        print('nsamples = %d' % len(subset))
        publish('prob_diff ~ C(Area) * C(Duration)', subset, 'rd_' + label)
        publish('prob_diff ~ C(Area) * C(Amplitude)', subset, 'ra_' + label)
def anova_interaction():
    """ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses.

    Loads ``altman_12_6.txt`` through ``getData`` and prints the ANOVA table
    of an OLS model with both main effects and their interaction.
    """
    # Get the data
    data = getData('altman_12_6.txt')

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])

    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    # print() call instead of the Python 2 print statement
    print(anova_lm(lm))
def anova(dv):
    """Perform ANOVA.

    Fits ``<dv> ~ C(group) * age * iq`` on the frame returned by
    ``make_summary()`` and prints a type-2 ANOVA table computed with the
    ``hc3`` robust covariance option, preceded by a header line naming ``dv``.

    :param dv: name of the dependent variable, inserted into the formula
    """
    df = make_summary()
    lm = ols('%s ~ C(group) * age * iq' % dv, data=df).fit()
    divider = '---------'
    table = anova_lm(lm, typ=2, robust='hc3')
    # One pre-formatted string reproduces the layout of the former print
    # statement (header, newline, table) and also parses on Python 3.
    print('%s %s %s \n%s' % (divider, dv, divider, table))
def anova(df, fmla, typ=3):
    """Fit an OLS model and collect its ANOVA results.

    :param df: data passed to ``ols`` together with the formula
    :param fmla: model formula
    :param typ: ANOVA type forwarded to ``anova_lm`` (default 3)
    :return: dict with the fitted model (``lm``), the ANOVA table (``aov``),
        the p-values relabelled ``p_<term>`` (``pvals``), each non-residual
        term's share of the summed non-residual sum of squares relabelled
        ``ess_<term>`` (``ess``), and the fitted parameters relabelled
        ``fit_<name>`` (``fit``)
    :raises ZeroDivisionError: if the ANOVA table contains a negative sum of
        squares
    """
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    # Anova/OLS
    lm = ols(fmla, df).fit()  # 'data' <==> 'df' keyword change with version

    # Grab the pvalues (note we use Type III by default). Work on a copy so
    # relabelling does not touch the column held by the ANOVA table.
    aov = anova_lm(lm, typ=typ)
    pvals = aov["PR(>F)"].copy()
    pvals.index = ["p_" + s for s in pvals.index]

    # Grab the explainable sum of squares
    ess = aov.drop("Residual").sum_sq
    ess = ess / ess.sum()
    ess.index = ["ess_" + s for s in ess.index]

    # Grab the fit
    fit = lm.params.copy()
    fit.index = ["fit_" + s for s in fit.index]

    # I think this happens with pathological inputs. Raised explicitly; the
    # exception type matches what the former ``1 / 0`` produced.
    if np.any(aov["sum_sq"] < 0):
        raise ZeroDivisionError("ANOVA table contains a negative sum of squares")

    return {"lm": lm, "aov": aov, "pvals": pvals, "ess": ess, "fit": fit}
def test_results(self):
    """Compare the additive model against ``self.kidney_lm`` with anova_lm.

    The two-row comparison table is checked column by column against
    hard-coded expected values.
    """
    nan = np.nan
    almost_equal = np.testing.assert_almost_equal

    additive = ols("np.log(Days+1) ~ C(Duration) + C(Weight)", self.data).fit()
    comparison = anova_lm(additive, self.kidney_lm)

    # Residual degrees of freedom must match exactly.
    np.testing.assert_equal(comparison["df_resid"].values, np.array([56, 54]))
    # Residual sums of squares are compared to four decimals.
    almost_equal(comparison["ssr"].values, np.array([29.62486, 28.9892]), 4)

    # Remaining columns use the default precision; the first row of the
    # difference columns is NaN in the expected values.
    expected_by_column = (
        ("df_diff", [0, 2]),
        ("ss_diff", [nan, 0.6356584]),
        ("F", [nan, 0.5920404]),
        ("Pr(>F)", [nan, 0.5567479]),
    )
    for column, expected in expected_by_column:
        almost_equal(comparison[column].values, np.array(expected))
def startanova(self):
    """Run a one-way ANOVA of ``Time`` by ``Fitness`` on the rehab data.

    The data are read from the local cache file ``rehab.table``; when that
    file cannot be read they are downloaded and the cache is written. The
    method prints the data, draws a boxplot, fits ``Time ~ C(Fitness)``,
    prints the ANOVA table (using ``self.test`` and ``self.robust``), the
    design matrix and the model summary, and finally shows the figure.
    """
    import matplotlib.pyplot as plt
    import pandas
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    try:
        rehab_table = pandas.read_csv('rehab.table')
    except (IOError, OSError):
        # No readable cache yet: fetch the data. pandas reads URLs directly,
        # so no Python-2-only urllib2 call is needed.
        url = 'http://stats191.stanford.edu/data/rehab.csv'
        rehab_table = pandas.read_csv(url)
        # index=False keeps the cache identical to the downloaded table;
        # writing the index would add an extra unnamed column on reload.
        rehab_table.to_csv('rehab.table', index=False)

    print(rehab_table)

    plt.figure(figsize=(6, 6))
    rehab_table.boxplot('Time', 'Fitness', ax=plt.gca())

    rehab_lm = ols('Time ~ C(Fitness)', data=rehab_table).fit()
    table9 = anova_lm(rehab_lm, test=self.test, robust=self.robust)
    print(table9)
    print(rehab_lm.model.data.orig_exog)
    print(rehab_lm.summary())
    plt.show()
def doAnova(data):
    """one-way ANOVA

    Builds a DataFrame from ``data``, fits ``StressReduction ~ C(Treatment)``
    and prints the ANOVA table, plus a note when the p-value in the first row
    is below 0.05.

    :param data: anything ``pd.DataFrame`` accepts that yields the columns
        ``StressReduction`` and ``Treatment``
    """
    df = pd.DataFrame(data)
    model = ols('StressReduction ~ C(Treatment)', df).fit()

    anovaResults = anova_lm(model)
    print(anovaResults)
    # The table is indexed by term name; take the first row by position
    # instead of relying on an integer key.
    if anovaResults['PR(>F)'].iloc[0] < 0.05:
        print('One of the groups is different.')
def anova_statsmodels():
    """Do the ANOVA with a function.

    Reads ``galton.csv`` from the ``data_kaplan`` directory, fits
    ``height ~ 1 + sex`` and prints the ANOVA table.

    Returns:
        The first entry of the ``F`` column of the ANOVA table.
    """
    # Get the data
    data = pd.read_csv(r"..\Data\data_kaplan\galton.csv")

    anova_results = anova_lm(ols("height ~ 1 + sex", data).fit())
    print('\nANOVA with "statsmodels" ------------------------------')
    print(anova_results)

    # Label-indexed table: select the first row by position.
    return anova_results["F"].iloc[0]
def anova_statsmodels():
    '''Do the ANOVA with a function.

    Reads ``galton.csv`` from the ``data_kaplan`` directory, fits
    ``height ~ 1 + sex`` and prints the ANOVA table.

    Returns:
        The first entry of the ``F`` column of the ANOVA table.
    '''
    # Get the data
    data = pd.read_csv(r'..\Data\data_kaplan\galton.csv')

    anova_results = anova_lm(ols('height ~ 1 + sex', data).fit())
    print('\nANOVA with "statsmodels" ------------------------------')
    print(anova_results)

    # Label-indexed table: select the first row by position.
    return anova_results['F'].iloc[0]
def test_results(self):
    """Check the default ANOVA table of ``self.kidney_lm``.

    Degrees of freedom must match exactly; sums of squares, mean squares and
    F values are compared to four decimals, p-values at default precision.
    """
    Df = np.array([2, 2, 2, 54])
    sum_sq = np.array([158.6415227, 16.97129, 0.6356584, 28.9892])
    mean_sq = np.array([79.3207613, 8.485645, 0.3178292, 0.536837])
    f_value = np.array([147.7557648, 15.80674, 0.5920404, np.nan])
    pr_f = np.array([1.262324e-22, 3.944502e-06, 0.5567479, np.nan])

    results = anova_lm(self.kidney_lm)
    np.testing.assert_equal(results["df"].values, Df)
    np.testing.assert_almost_equal(results["sum_sq"].values, sum_sq, 4)
    # The expected mean squares were defined but never compared.
    np.testing.assert_almost_equal(results["mean_sq"].values, mean_sq, 4)
    np.testing.assert_almost_equal(results["F"].values, f_value, 4)
    np.testing.assert_almost_equal(results["PR(>F)"].values, pr_f)
def test_results(self):
    """Check the default ANOVA table of ``self.kidney_lm``.

    Degrees of freedom must match exactly; sums of squares, mean squares and
    F values are compared to four decimals, p-values at default precision.
    """
    Df = np.array([1, 2, 2, 54])
    sum_sq = np.array([2.339693, 16.97129, 0.6356584, 28.9892])
    mean_sq = np.array([2.339693, 8.485645, 0.3178292, 0.536837])
    f_value = np.array([4.358293, 15.80674, 0.5920404, np.nan])
    pr_f = np.array([0.0415617, 3.944502e-06, 0.5567479, np.nan])

    results = anova_lm(self.kidney_lm)
    np.testing.assert_equal(results['df'].values, Df)
    np.testing.assert_almost_equal(results['sum_sq'].values, sum_sq, 4)
    # The expected mean squares were defined but never compared.
    np.testing.assert_almost_equal(results['mean_sq'].values, mean_sq, 4)
    np.testing.assert_almost_equal(results['F'].values, f_value, 4)
    np.testing.assert_almost_equal(results['PR(>F)'].values, pr_f)
def one_stats(data_lastDV):
    """Do basic analysis of one IV onto one DV.

    Prints descriptive statistics of the first column, shows two bar charts
    (instances per condition, mean outcome per condition), prints a one-way
    ANOVA of the last column on the first, shows a boxplot, and then runs
    ``anova_interaction`` and ``plot_interaction`` for the first column paired
    with every column in between.

    :param data_lastDV: pandas dataframe we are exploring (IV-of-interest in
        first column, followed by IVs, and DV in last index)
    :return: None
    """
    def label_axes(axes, x_label, y_label):
        # Apply both axis labels to an already drawn plot.
        axes.set_xlabel(x_label)
        axes.set_ylabel(y_label)

    remaining = data_lastDV.columns.values.tolist()  # all column names
    causal = remaining.pop(0)   # first column: the condition of interest
    outcome = remaining.pop()   # last column: the dependent variable
    pair = data_lastDV[[causal, outcome]]

    # descriptive stats
    print(FORMAT_LINE)
    print(pair[causal].describe())
    print(FORMAT_LINE)

    figure = plt.figure()

    # left panel: number of instances assigned to each condition
    figure.add_subplot(121)
    per_condition = pair.groupby(causal)[causal].count()
    label_axes(per_condition.plot(kind='bar', title=causal), causal, "count instances")

    # right panel: mean outcome within each condition
    figure.add_subplot(122)
    per_condition = data_lastDV.groupby(causal)[outcome].mean()
    label_axes(per_condition.plot(kind='bar', title=causal), causal, "mean " + outcome)
    plt.show()

    # One Way ANOVA
    fitted = ols(outcome + " ~ C(" + causal + ")", data=pair).fit()
    table = anova_lm(fitted)
    print("\n" + FORMAT_LINE)
    print("One-Way ANOVA: " + causal + " --> " + outcome)
    print(FORMAT_LINE)
    print(table)
    print(fitted.summary())

    # boxplot of the outcome per condition
    figure = plt.figure()
    figure.add_subplot(111)
    label_axes(pair.boxplot(outcome, causal, ax=plt.gca()), causal, outcome)
    plt.show()

    # pair the condition of interest with every remaining IV
    for other in remaining:
        anova_interaction(data_lastDV[[causal, other, outcome]])
        plot_interaction(data_lastDV[[causal, other, outcome]])
def test_results(self):
    """Check the type-II ANOVA table computed with ``robust="hc0"``.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    # Kept only for the disabled comparison below; it has five entries while
    # the other expected arrays have four.
    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 2, 2, 51])
    expected_f = np.array([6.972744, 13.7804, 0.1709936, np.nan])
    expected_p = np.array([0.01095599, 1.641682e-05, 0.8433081, np.nan])

    table = anova_lm(fitted, typ="II", robust="hc0")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sum_sq check left disabled:
    # np.testing.assert_almost_equal(table['sum_sq'].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def test_results(self):
    """Check the type-III ANOVA table of the sum-coded interaction model.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 1, 2, 2, 51])
    expected_f = np.array([279.7545, 5.367071, 12.43245, 0.1760025, np.nan])
    expected_p = np.array([2.379855e-22, 0.02457384, 3.999431e-05, 0.8391231, np.nan])

    table = anova_lm(fitted, typ="III")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sums of squares and F values to four decimals, p-values at default precision
    np.testing.assert_almost_equal(table["sum_sq"].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def test_results(self):
    """Check the type-III ANOVA table computed with ``robust="hc0"``.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    # Kept only for the disabled comparison below.
    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 1, 2, 2, 51])
    expected_f = np.array([298.3404, 5.723638, 13.76069, 0.1709936, np.nan])
    expected_p = np.array([5.876255e-23, 0.02046031, 1.662826e-05, 0.8433081, np.nan])

    table = anova_lm(fitted, typ="III", robust="hc0")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sum_sq check left disabled:
    # np.testing.assert_almost_equal(table['sum_sq'].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def test_results(self):
    """Check the type-III ANOVA table computed with ``robust="hc1"``.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    # Kept only for the disabled comparison below.
    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 1, 2, 2, 51])
    expected_f = np.array([266.9361, 5.12115, 12.3122, 0.1529943, np.nan])
    expected_p = np.array([6.54355e-22, 0.02792296, 4.336712e-05, 0.858527, np.nan])

    table = anova_lm(fitted, typ="III", robust="hc1")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sum_sq check left disabled:
    # np.testing.assert_almost_equal(table['sum_sq'].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def test_results(self):
    """Check the type-III ANOVA table computed with ``robust="hc2"``.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    # Kept only for the disabled comparison below.
    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 1, 2, 2, 51])
    expected_f = np.array([264.5137, 5.074677, 12.19158, 0.1501224, np.nan])
    expected_p = np.array([7.958286e-22, 0.02860926, 4.704831e-05, 0.8609815, np.nan])

    table = anova_lm(fitted, typ="III", robust="hc2")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sum_sq check left disabled:
    # np.testing.assert_almost_equal(table['sum_sq'].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def test_results(self):
    """Check the type-III ANOVA table computed with ``robust="hc3"``.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    # Kept only for the disabled comparison below.
    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 1, 2, 2, 51])
    expected_f = np.array([234.4026, 4.496996, 10.79903, 0.1317223, np.nan])
    expected_p = np.array([1.037224e-20, 0.03883841, 0.0001228716, 0.8768817, np.nan])

    table = anova_lm(fitted, typ="III", robust="hc3")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sum_sq check left disabled:
    # np.testing.assert_almost_equal(table['sum_sq'].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def test_results(self):
    """Check the type-II ANOVA table of the interaction model without intercept.

    The model (formula ending in ``- 1``) is fitted on ``self.data`` without
    the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum) - 1", trimmed).fit()

    expected_sum_sq = np.array([154.7131692, 13.27205, 0.1905093, 27.60181])
    expected_df = np.array([2, 2, 2, 51])
    expected_f = np.array([142.9321191, 12.26141, 0.1760025, np.nan])
    expected_p = np.array([1.238624e-21, 4.487909e-05, 0.8391231, np.nan])

    table = anova_lm(fitted, typ="II")
    np.testing.assert_equal(table["df"].values, expected_df)
    # sums of squares and F values to four decimals, p-values at default precision
    np.testing.assert_almost_equal(table["sum_sq"].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table["F"].values, expected_f, 4)
    np.testing.assert_almost_equal(table["PR(>F)"].values, expected_p)
def anova_oneway():
    '''One-way ANOVA: test if results from 3 groups are equal.

    Twenty-two patients undergoing cardiac bypass surgery were randomized to
    one of three ventilation groups:

    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture
        continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture
        only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen
        for 24 h.

    The data show red cell folate levels for the three groups after 24h'
    ventilation.

    Returns:
        Tuple ``(F_statistic, pVal)`` from ``stats.f_oneway``
        (should be (3.711335988266943, 0.043589334959179327)).
    '''
    # Get the data
    print('One-way ANOVA: -----------------')
    inFile = 'altman_910.txt'
    data = np.genfromtxt(inFile, delimiter=',')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # The table is indexed by term name, so its first row is taken by position.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def do_ANOVA(data):
    '''4.3.2. Perform an ANOVA on the data.

    :param data: DataFrame containing the columns "weight" and "group"
    '''
    print('ANOVA: ----------------------------------------------')

    # First, I fit a statistical "ordinary least square (ols)"-model to the
    # data, using the formula language from "patsy". The formula
    # 'weight ~ C(group)' says: "weight" is a function of the categorical
    # value "group", and the data are taken from the DataFrame "data", which
    # contains "weight" and "group".
    model = ols('weight ~ C(group)', data).fit()

    # "anova_lm" (where "lm" stands for "linear model") extracts the
    # ANOVA-parameters from the fitted model.
    anovaResults = anova_lm(model)
    print(anovaResults)

    # The table is indexed by term name; take the first row by position
    # instead of relying on an integer key.
    if anovaResults['PR(>F)'].iloc[0] < 0.05:
        print('One of the groups is different.')
def anova_interaction():
    '''ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.

    Returns:
        The first entry of the ``F`` column of the ANOVA table.
    '''
    # Get the data. Raw string: the backslashes are literal path separators,
    # not escape sequences (the resulting value is unchanged).
    data = getData('altman_12_6.txt', subDir=r'..\Data\data_altman')

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # Label-indexed table: select the first row by position.
    return anovaResults['F'].iloc[0]
def anova_interaction():
    """ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.

    Returns:
        The first entry of the ``F`` column of the ANOVA table.
    """
    # Get the data. Raw string: the backslashes are literal path separators,
    # not escape sequences (the resulting value is unchanged).
    data = getData("altman_12_6.txt", subDir=r"..\Data\data_altman")

    # Bring them in DataFrame-format
    df = pd.DataFrame(data, columns=["hs", "fetus", "observer"])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = "hs ~ C(fetus) + C(observer) + C(fetus):C(observer)"
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # Label-indexed table: select the first row by position.
    return anovaResults["F"].iloc[0]
def ancova(data_covar_lastDV):
    """ANCOVA for when you have a numerical covariate to control for.

    Read more about ANOVA/ANCOVA/etc here:
    http://www.statsmakemecry.com/smmctheblog/stats-soup-anova-ancova-manova-mancova
    http://elderlab.yorku.ca/~elder/teaching/psyc3031/lectures/Lecture%207%20Analysis%20of%20Covariance%20-%20ANCOVA%20%28GLM%202%29.pdf (slide 24)

    For every condition column the model ``outcome ~ covariate + condition``
    is fitted on the rows without missing values; its ANOVA table and summary
    are printed and a boxplot of the outcome per condition is added to a
    shared figure, which is shown only if the user confirms.

    :param data_covar_lastDV: data frame containing the independent and
        dependent variables (covariate is second to last, DV is last item in
        list)
    :return: None
    """
    col_names = data_covar_lastDV.columns.values.tolist()  # get the columns' names
    outcome = col_names.pop()    # remove the last item in the list
    covariate = col_names.pop()  # remove the [second to] last item in the list

    fig = plt.figure()
    # One panel per condition. The grid keeps its former width of two for up
    # to two conditions and widens beyond that, where a fixed 1x2 grid would
    # reject the third subplot index.
    n_panels = max(2, len(col_names))

    for i, cond in enumerate(col_names, start=1):
        cond_table = data_covar_lastDV[[cond, covariate, outcome]].dropna()
        cond_lm = ols(outcome + " ~ " + covariate + " + " + cond, data=cond_table).fit()
        anova_table = anova_lm(cond_lm)

        print("\n" + FORMAT_LINE)
        print("ANCOVA: " + cond + " + " + covariate + " --> " + outcome)
        print(FORMAT_LINE)
        print(anova_table)
        print(cond_lm.summary())

        # box plot
        # TODO: need to remove the effect of the covariate before plotting
        # http://statsmodels.sourceforge.net/devel/examples/generated/example_interactions.html
        ax = fig.add_subplot(1, n_panels, i)
        ax = cond_table.boxplot(outcome, cond, ax=plt.gca())
        ax.set_xlabel(cond)
        ax.set_ylabel(outcome)

    user_input = input(">> Display boxplot of conditions? [y/n]: ")
    if is_yes(user_input):
        fig.tight_layout()
        plt.show()
def anova_oneway():
    '''One-way ANOVA: test if results from 3 groups are equal.

    Loads ``altman_910.txt`` through ``getData``, splits column 0 into three
    groups according to the group code in column 1, runs a Levene test and a
    one-way ANOVA with ``scipy.stats``, and cross-checks the F statistic
    against the ANOVA table of the equivalent statsmodels OLS model.

    Returns:
        Tuple ``(F_statistic, pVal)`` from ``stats.f_oneway``
        (should be (3.711335988266943, 0.043589334959179327)).
    '''
    # Get the data. Raw string: the backslashes are literal path separators,
    # not escape sequences (the resulting value is unchanged).
    print('One-way ANOVA: -----------------')
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # The table is indexed by term name, so its first row is taken by position.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def test_results(self):
    """Check the type-II ANOVA table computed with ``robust="hc3"``.

    The model is fitted on ``self.data`` without the rows labelled 0, 1, 2.
    """
    trimmed = self.data.drop([0, 1, 2])
    fitted = ols("np.log(Days+1) ~ C(Duration, Sum)*C(Weight, Sum)", trimmed).fit()

    # Kept only for the disabled comparison below; it has five entries while
    # the other expected arrays have four.
    expected_sum_sq = np.array([151.4065, 2.904723, 13.45718, 0.1905093, 27.60181])
    expected_df = np.array([1, 2, 2, 51])
    expected_f = np.array([5.633786, 10.89842, 0.1317223, np.nan])
    expected_p = np.array([0.02142223, 0.0001145965, 0.8768817, np.nan])

    table = anova_lm(fitted, typ="II", robust="hc3")
    np.testing.assert_equal(table['df'].values, expected_df)
    # sum_sq check left disabled:
    # np.testing.assert_almost_equal(table['sum_sq'].values, expected_sum_sq, 4)
    np.testing.assert_almost_equal(table['F'].values, expected_f, 4)
    np.testing.assert_almost_equal(table['PR(>F)'].values, expected_p)
# Earlier experiment, left disabled: QR decomposition of a two-column
# design matrix.
# l=[]
# for x in range(80):
#     l.append([1,1])
# for y in range(1):
#     l.append([2,1])
# exog=np.array(l)
# q,r = np.linalg.qr(exog)
# print(np.dot(q,r))

# Small matrix-vector product, printed for inspection.
a = np.array([[-0.5, -0.5, -0.5, -0.5],
              [-0.5, -0.5, 0.5, 0.5]])
b = np.array([2, 2, 2, 2])
print(a, b)
print(np.dot(a, b))

# Toy data set: a constant response and two groups of two observations.
# coh_list=[0,1,3,3]
coh_list = [2] * 4
id = [2] * 2 + [1] * 2
data = {'id': id, 'coherence': coh_list}
df = pd.DataFrame(data)
print(df)

# Fit the one-factor model twice: once to print the results object, once
# for the type-1 ANOVA table.
print(ols('coherence ~ C(id)', df).fit())
anova_res = anova_lm(ols('coherence ~ C(id)', df).fit(), typ=1)
print(anova_res)
print(type(anova_res.loc['C(id)']['PR(>F)']))
# Draw the two fitted lines of lm4 on top of the data: purple uses the
# Intercept and TEST coefficients alone, green adds the MINORITY and
# TEST:MINORITY coefficients to them.
fig, ax = plot_data(jobtest_table)
fig = abline_plot(
    intercept=lm4.params['Intercept'],
    slope=lm4.params['TEST'],
    ax=ax,
    color='purple',
)
fig = abline_plot(
    intercept=lm4.params['Intercept'] + lm4.params['MINORITY'],
    slope=lm4.params['TEST'] + lm4.params['TEST:MINORITY'],
    ax=ax,
    color='green',
)
plt.title("JPERF ~ TEST * TEST:MINORITY")
plt.show()

# is there any effect of MINORITY on slope or intercept?
table = anova_lm(lm, lm4)
print("TEST vs. TEST * MINORITY")
print(table)
print("\n")

# Output recorded for the comparison above:
#
# TEST vs. TEST * MINORITY
# ========================
#    df_resid        ssr  df_diff    ss_diff         F    Pr(>F)
# 0      18.0  45.568297      0.0        NaN       NaN       NaN
# 1      16.0  31.655473      2.0  13.912824  3.516061  0.054236

# is there any effect of MINORITY on slope
# NOTE: assumption. the slope is the same within each group
table = anova_lm(lm, lm3)
print("TEST vs. TEST:MINORITY")
from mpl_toolkits.mplot3d import Axes3D import matplotlib.pyplot as plt import pandas as pd import numpy as np import statsmodels.formula.api as an from statsmodels.stats.anova import anova_lm a = pd.DataFrame({ "Fertilizer": [100, 200, 300, 400, 500, 600, 700], "Rainfall": [10, 20, 10, 30, 20, 20, 30], "Yield": [40, 50, 50, 70, 65, 65, 80] }) result = an.ols(formula="Yield ~ Fertilizer + Rainfall", data=a).fit() print result.params print(anova_lm(result)) print(result.summary()) fig = plt.figure() axis = fig.add_subplot(111, projection='3d') axis.scatter(a['Fertilizer'], a['Rainfall'], a['Yield'], c='r', marker='o') xx, yy = np.meshgrid(a['Fertilizer'], a['Rainfall']) exog = pd.core.frame.DataFrame({ 'Fertilizer': xx.ravel(), 'Rainfall': yy.ravel() }) out = result.predict(exog=exog) axis.plot_surface(xx, yy, out.values.reshape(xx.shape), rstride=1, cstride=1,
# print(df.columns.tolist())

# 2. Data preparation
# 1) Declare the role of each column
catCols = ['季度']
intCols = ['GNP', '失业率', '利率']
target = '销量'

# 2) Feature selection (analysis of covariance)
import statsmodels.formula.api as smf
import statsmodels.stats.anova as sms

cols = catCols + intCols
formula = '{} ~ {}'.format(target, '+'.join(cols))
module = smf.ols(formula, df).fit()
dfanova = sms.anova_lm(module)

# Terms whose p-value is below 0.05
cond = dfanova['PR(>F)'] < 0.05
cols = dfanova[cond].index.tolist()
print('显著影响的因素:', cols)

# Drop the factors without a significant effect. The lists are rebuilt in
# one step (in place, via slice assignment) because calling remove() on a
# list while iterating over it skips the element after each removal.
catCols[:] = [col for col in catCols if col in cols]
intCols[:] = [col for col in intCols if col in cols]

# # 3) For forecasting, the predictors need to be shifted and the first row dropped
# shiftCols = ['GNP','失业率','利率']
def run(self, dfx, tsy):
    """Summarise numeric columns of ``dfx`` per category of ``tsy`` with ANOVA.

    Both inputs get a fresh positional index. Validation failures (length
    mismatch, ``tsy`` rejected by ``isSeries``/``isCategory``, no numeric
    columns reported by ``ParseDFtypes``) are logged and reported through
    ``msg['error']`` together with an empty DataFrame. Non-numeric columns
    are ignored with a warning.

    :param dfx: DataFrame of candidate variables
    :param tsy: grouping Series; its ``name`` is used in the model formula
    :return: dict with ``result`` (one row per numeric column holding
        "mean±std" per group plus the first row of that column's ANOVA
        table, p-value formatted to five decimals) and ``msg``
    """
    def failure(messages):
        # Common shape of every early exit: empty result plus the messages.
        return {'result': pd.DataFrame(), 'msg': messages}

    tsy = tsy.reset_index(drop=True)
    dfx = dfx.reset_index(drop=True)
    msg = {}

    n_x = len(dfx)
    n_y = len(tsy)
    if n_x != n_y:
        logging.error(
            'the length of input X:%s is not equal the length of Y: %s ! ' % (n_x, n_y))
        msg['error'] = '输入的dfx的长度为:%s 不等于输入的tsy的长度: %s ' % (n_x, n_y)
        return failure(msg)

    if not isSeries(tsy) or not isCategory(tsy):
        logging.error('input tsy is not a pandas Series or not a category data!')
        msg['error'] = '输入的tsy不是定类型数据或者Series类型'
        return failure(msg)

    x_numer_cols, x_cate_cols = ParseDFtypes(dfx)
    if x_numer_cols == []:
        logging.error(
            'All input dfx are no numeric columns, Please check your input dfx data!')
        msg['error'] = '输入的dfx所有的列都不是数值型数据,请检查输入数据'
        return failure(msg)

    if x_cate_cols != []:
        logging.warning(
            'input dfx has non-numeric columns: %s, will ignore these columns!' % x_cate_cols)
        msg['warning'] = '输入的dfx包含了非数值型的列: %s, 将会被自动忽略!' % x_cate_cols

    group_name = tsy.name
    combined = dfx[x_numer_cols].join(tsy)
    # Transposed so that rows are variables and columns are groups.
    group_means = combined.groupby(group_name).mean().T
    group_stds = combined.groupby(group_name).std().T

    def with_spread(mean_column):
        # Render every mean of one group as "mean±std", rounded to two decimals.
        labels = [
            '%s±%s' % (round(mean_column.loc[key], 2),
                       round(group_stds[mean_column.name].loc[key], 2))
            for key in mean_column.index
        ]
        return pd.Series(labels, index=mean_column.index)

    described = group_means.apply(with_spread)

    first_rows = []
    for column in x_numer_cols:
        fitted = ols('%s ~ %s' % (column, tsy.name), combined).fit()
        table = anova_lm(fitted)
        table.columns = ['自由度', '平方和', '均方和', 'F-值', 'p-值']
        # Keep only the first row of the table, labelled with the variable.
        first_rows.append(table.iloc[0].to_frame(name=column).T)

    res = described.join(pd.concat(first_rows))
    res['p-值'] = res['p-值'].apply(lambda value: '{:.5f}'.format(value))
    return {'result': res.round(5), 'msg': msg}
'treat1':list(np.random.normal(15,5,100)), \ 'treat2':list(np.random.normal(20,5,100)), \ 'treat3':list(np.random.normal(30,5,100)), \ 'treat4':list(np.random.normal(31,5,100))} #组合成数据框 import pandas as pd df = pd.DataFrame(df) df.head() df.boxplot(grid=False) import matplotlib.pyplot as plt plt.show() #数据格式整理为一列为处理,一列为数值的形式 df_melt = df.melt() df_melt.head() df_melt.columns = ['Treat', 'Value'] df_melt.head() from statsmodels.formula.api import ols from statsmodels.stats.anova import anova_lm model = ols('Value~C(Treat)', data=df_melt).fit() anova_table = anova_lm(model, typ=2) print(anova_table) import seaborn as sns sns.boxplot(x='Treat', y='Value', data=df_melt) #参考https://zhuanlan.zhihu.com/p/91031244
print(model.summary())

# Add a month column derived from the date (data preprocessing)
data1['month'] = data1['date'].dt.month

# Question 2, at alpha = 0.05 (recorded conclusion):
#   F (2.467) < critical F, equivalently p (0.124) > 0.05, so the null
#   hypothesis H0 holds: month has no effect on sales volume.
formula2 = 'value~month'
model2 = ols(formula2, data=data1).fit()
print(model2.summary())

# The same model is fitted a second time and used for the ANOVA table.
model2_2 = ols(formula2, data1).fit()
anovat = anova_lm(model2_2)
print(anovat)

# Question 2, at alpha = 0.05 (recorded conclusion):
#   1. Region: p (0.020985) < 0.05, so H0 is rejected: region has a
#      significant effect on sales volume.
#   2. Month: p (0.094860) > 0.05, so H0 cannot be rejected: month has no
#      significant effect on sales volume.
# NOTE(review): the formula below uses the column `item`, while the recorded
# conclusion speaks of region — confirm they refer to the same variable.
formula3 = 'value~month + item'
model3 = ols(formula3, data=data1).fit()
anovat3 = anova_lm(model3)
print(anovat3)
np.random.seed(1)
# normal distributed noise
y = -5 + 3*x + 4 * np.random.normal(size=x.shape)

# Create a data frame containing all the relevant variables
data = pd.DataFrame({'x': x, 'y': y})

plt.figure(figsize=(5, 4))
plt.plot(data["x"], data["y"], 'o')

from statsmodels.formula.api import ols
model = ols("y ~ x", data).fit()
print(model.summary())

# Perform analysis of variance on fitted linear model
from statsmodels.stats.anova import anova_lm
anova_results = anova_lm(model)
print(anova_results)

# Retrieve the parameter estimates through the public ``params`` attribute,
# by name, rather than through the private ``_results`` object and by
# position.
beta_0 = model.params['Intercept']
beta_1 = model.params['x']
plt.plot(x, x*beta_1 + beta_0)
plt.xlabel('x')
plt.ylabel('y')

# data with linear model depicted; run all lines together
plt.plot(data["x"], data["y"], 'o')
plt.plot(x, x*beta_1 + beta_0, color="black")
plt.xlabel('x')
plt.ylabel('y')
def get_fairness_analyses(df,
                          group,
                          system_score_column,
                          human_score_column='sc1',
                          base_group=None):
    """
    Compute fairness analyses described in `Loukina et al. 2019 <https://www.aclweb.org/anthology/W19-4401/>`_.

    The function computes how much variance group membership explains in
    overall score accuracy (osa), overall score difference (osd),
    and conditional score difference (csd). See the paper for more
    details.

    The input frame is not modified: the helper columns ("error", "SE",
    "group", "sc1_cat") are added to an internal copy.

    Parameters
    ----------
    df: pandas DataFrame
        A dataframe containing columns with numeric human scores,
        columns with numeric system scores and a column with
        group membership.
    group: str
        Name of the column containing group membership.
    system_score_column: str
        Name of the column containing system scores.
    human_score_column: str
        Name of the column containing human scores.
    base_group: str, optional
        Name of the group to use as the reference category.
        Defaults to ``None`` in which case the group with the
        largest number of cases will be used as the reference
        category. Ties are broken alphabetically.

    Returns
    -------
    model_dict: dictionary
        A dictionary with different proposed metrics as keys
        ("osa", "osd", "csd") and fitted models as values.
    fairness_container: DataContainer
        A datacontainer with the following datasets:
        - "estimates_<METRIC>_by_<GROUP>" where "<GROUP>" corresponds
        to the given group and "<METRIC>" can be "osa", "osd" and "csd"
        estimates for each group computed by the respective models.
        - "fairness_metrics_by_<GROUP>" - a summary of model fits
        (R2 and p values).
    """
    # local import: only needed to scope the warning suppression below
    import warnings

    # work on a copy so that the helper columns added below neither leak into
    # nor overwrite columns of the caller's data frame
    df = df.copy()

    # compute error and squared error
    df['error'] = df[system_score_column] - df[human_score_column]
    df['SE'] = df['error']**2

    # convert group values to category and reorder them using
    # the largest category as reference
    df['group'] = convert_to_ordered_category(df[group], base_group=base_group)
    base_group = df['group'].cat.categories[0]

    df['sc1_cat'] = convert_to_ordered_category(df[human_score_column])

    # Overall score accuracy (OSA)
    # Variance in squared error explained by L1

    # fit the model
    osa_model = smf.ols(formula='SE ~ group', data=df)
    osa_fit = osa_model.fit()

    # collect the results
    osa_dict = {'R2': osa_fit.rsquared_adj,
                'sig': osa_fit.f_pvalue}
    osa_results = pd.Series(osa_dict, name='Overall score accuracy')
    df_coefficients_osa = get_coefficients(osa_fit, base_group)

    # Overall score difference (OSD)
    # variance in signed residuals (raw error) explained by L1

    # fit the model
    osd_model = smf.ols(formula='error ~ group', data=df)
    osd_fit = osd_model.fit()

    # collect the results
    osd_dict = {'R2': osd_fit.rsquared_adj,
                'sig': osd_fit.f_pvalue}
    osd_results = pd.Series(osd_dict, name='Overall score difference')
    df_coefficients_osd = get_coefficients(osd_fit, base_group)

    # conditional score difference CSD
    # Variance in score difference conditioned on Native language

    # fit "null" model with human score only
    csd_null_mod = smf.ols(formula='error ~ sc1_cat', data=df)
    csd_null_fit = csd_null_mod.fit()

    # fit model with both human score and group
    csd_mod = smf.ols(formula='error ~ group + sc1_cat', data=df)
    csd_fit = csd_mod.fit()

    # compare the two models using anova_lm
    # we filter warnings for this function because we get
    # runtime warning due to NaNs in the data.
    # these seem to be by design: https://groups.google.com/forum/#!topic/pystatsmodels/-flY0cNnb3k
    # The context manager restores the previous warning filters on exit; the
    # earlier `np.warnings` calls wiped every globally configured filter and
    # `np.warnings` is no longer available in recent NumPy releases.
    with warnings.catch_warnings():
        warnings.simplefilter('ignore')
        anova_results = anova_lm(csd_null_fit, csd_fit)

    # collect the results. Note that R2 in this case is a difference
    # in R2 between the two models and significance is obtained from anova
    # (last column of the second row of the model-comparison table)
    csd_dict = {'R2': csd_fit.rsquared_adj - csd_null_fit.rsquared_adj,
                'sig': anova_results.values[1][-1]}
    csd_results = pd.Series(csd_dict, name='Conditional score difference')
    df_coefficients_csd = get_coefficients(csd_fit, base_group)

    # create a summary table
    df_r2_all = pd.concat([osa_results, osd_results, csd_results],
                          axis=1, sort=True)
    df_r2_all['base_category'] = base_group

    # assemble all datasets into a DataContainer
    datasets = [{'name': 'estimates_osa_by_{}'.format(group),
                 'frame': df_coefficients_osa},
                {'name': 'estimates_osd_by_{}'.format(group),
                 'frame': df_coefficients_osd},
                {'name': 'estimates_csd_by_{}'.format(group),
                 'frame': df_coefficients_csd},
                {'name': 'fairness_metrics_by_{}'.format(group),
                 'frame': df_r2_all}]

    # assemble all models into a dictionary
    model_dict = {'osa': osa_fit,
                  'osd': osd_fit,
                  'csd': csd_fit}

    return model_dict, DataContainer(datasets=datasets)
def _oneway_anova(table, response_cols, factor_col):
    """Run a one-way ANOVA of every response column against one factor column.

    For each response column a box plot by factor level, the ANOVA table of
    the model ``response ~ C(factor)`` and two residual diagnostics
    (distribution plot and Q-Q plot) are appended to a markdown report.

    Parameters
    ----------
    table : pandas DataFrame holding the response and factor columns.
    response_cols : iterable of str, names of the response columns.
    factor_col : str, name of the grouping column.

    Returns
    -------
    dict
        ``{'result': result}`` where ``result['_grouped_data'][col]['p_value']``
        is the factor's p-value for each response column and
        ``result['_repr_brtc_']`` is the report built by ``BrtcReprBuilder``.
    """
    rb = BrtcReprBuilder()
    rb.addMD(strip_margin("""
    ## One-way Analysis of Variance Result
    """))

    groups = table[factor_col].unique()
    groups.sort()
    # total label length decides how far the x tick labels are rotated
    sum_len = sum(len(str(group)) for group in groups)

    result = dict()
    result['_grouped_data'] = dict()

    for response_col in response_cols:
        result['_grouped_data'][response_col] = dict()

        ax = sns.boxplot(x=factor_col, y=response_col, data=table, order=groups)
        if sum_len > 512:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=90)
        elif sum_len > 64:
            ax.set_xticklabels(ax.get_xticklabels(), rotation=45)

        fig_box = plt2MD(plt)
        plt.clf()

        # Q('...') quotes the column names so names that are not valid
        # identifiers can still be used in the formula
        model = ols("""Q('{response_col}') ~ C(Q('{factor_col}'))""".format(
            response_col=response_col, factor_col=factor_col), table).fit()  # TODO factor_col = class => error
        anova = anova_lm(model)

        # strip the patsy wrappers from the row labels for display
        index_list = anova.index.tolist()
        remove_list = ["C(Q('", "'))", "Q('", "')"]
        for v in remove_list:
            index_list = [i.replace(v, "") for i in index_list]
        anova.insert(0, '', index_list)

        anova_df = pandasDF2MD(anova)

        # first row is the factor term; select it by position explicitly,
        # since plain [0] on a label-indexed Series is deprecated in pandas
        p_value = anova["""PR(>F)"""].iloc[0]

        residual = model.resid

        sns.distplot(residual)
        distplot = plt2MD(plt)
        plt.clf()

        sm.qqplot(residual, line='s')
        qqplot = plt2MD(plt)
        plt.clf()

        rb.addMD(strip_margin("""
        | ## {response_col} by {factor_col}
        | {fig_box}
        |
        | ### ANOVA
        | {anova_df}
        |
        | ### Diagnostics
        | {distplot}
        |
        | {qqplot}
        """.format(response_col=response_col, factor_col=factor_col, fig_box=fig_box,
                   anova_df=anova_df, distplot=distplot, qqplot=qqplot)))

        result['_grouped_data'][response_col]['p_value'] = p_value

    result['_repr_brtc_'] = rb.get()
    return {'result': result}
#
# <b>Reject $H_0$ (reject the null hypothesis)</b>
#
# so there is a significant difference between the proportion of smokers in different genders

# #### d. Is the distribution of bmi across women with no children, one child and two children, the same ?

# In[223]:


filter1 = data[data["sex"] == "female"]
# Build the mask from `filter1` itself so it is aligned with the frame it
# filters (a mask taken from `data` has to be reindexed by pandas), and take
# a copy so the column assignments below act on an independent frame.
filter2 = filter1[filter1["children"].cat.codes < 3].copy()
filter2["children"] = pd.to_numeric(filter2["children"])
filter2["children"] = pd.Categorical(filter2["children"])

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

formula = "bmi ~ C(children)"
model = ols(formula, filter2).fit()
aov_table = anova_lm(model)
print(aov_table)


# $H_0:$ The mean bmi is the same for women with 0, 1, 2 children i.e $\mu_1 = \mu_2 = \mu_3$
#
# $H_1$: The mean bmi differs for at least one of the groups of women with 0, 1, 2 children
#
# The ANOVA p-value 0.715 is greater than 0.05, so we
#
# <b>Fail to reject $H_0$ (the null hypothesis is retained)</b>
#
# There is no evidence that the mean bmi differs across women with 0, 1, 2 children
# i.e. the data are consistent with $\mu_1 = \mu_2 = \mu_3$
least_error = np.finfo(np.float64).max
featurewithLeastSimple = ""

# Scale all data between 0 and 1
df3 = input_train[Columnlist]
tempcolumnnames = df3.columns
x_scaled = min_max_scaler.fit_transform(df3)
df3 = pd.DataFrame(x_scaled)
df3.columns = tempcolumnnames

# Simple regression: fit y against every single feature and keep the feature
# whose fit leaves the smallest residual mean square.
for col in Columnlist:
    x = input_train[col]
    # Hand both variables to the formula explicitly; passing only the Series
    # as `data` made the formula resolve `x` and `y` through the caller's
    # namespace by accident.
    Linearmodel = ols("y ~ x", {"x": x, "y": y}).fit()
    # public `params` instead of the private `_results`; order is
    # (intercept, slope)
    offset, coef = np.asarray(Linearmodel.params)
    anova_results = anova_lm(Linearmodel)
    # residual mean square, selected by row label rather than by position
    residual_mean_sq = anova_results.loc['Residual', 'mean_sq']
    if least_error > residual_mean_sq:
        least_error = residual_mean_sq
        featurewithLeastSimple = col
    print('\nANOVA results')
    print(anova_results)
    plt.plot(x, x * coef + offset)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.show()
print('Best Feature was: ' + featurewithLeastSimple)

# multiple feature regression
multipleReg = LinearRegression()
multipleReg.fit(df3, y)
print('Weights are : ')
def Correlations_Cont_Cat(p_data,
                          p_predictors,
                          p_numeric_cat_index=np.array([]),
                          p_weight=None,
                          p_p_val=0.01,
                          p_subsamplesize=100,
                          p_seed=0):
    """
    Use one-way ANOVA to find categorical - continuous relationships.

    Small differences come through as significant with a high number of
    observations, therefore the test is run on a random subsample
    (100 rows by default). Also keep in mind that by using ANOVA we assume
    normally distributed data and equal variances; an alternative is to use
    Kruskal - Wallis.

    Every significant pair is printed and shown as a box plot (drawn from
    the full data, not the subsample).

    Parameters
    ----------
    p_data : pandas DataFrame with the predictors.
    p_predictors : column positions of the predictors to consider.
    p_numeric_cat_index : column positions of numeric columns that are to be
        treated as categorical; forwarded to ``ContCatSplit``.
    p_weight : unused by this function.
    p_p_val : pairs with an ANOVA p-value below this threshold are reported.
    p_subsamplesize : number of rows drawn for the test; capped at the number
        of rows available.
    p_seed : seed of the private random generator used for the subsample.

    Returns
    -------
    list of ``[continuous column name, categorical column name, p-value]``,
    ordered by increasing p-value.
    """
    from random import Random

    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    # split once and reuse both halves
    split = ContCatSplit(p_data, p_numeric_cat_index)
    cont_index = np.intersect1d(p_predictors, split[0])
    cat_index = np.intersect1d(p_predictors, split[1])

    # Draw the subsample with a private generator so the global `random`
    # state is left alone, and never ask for more rows than exist.
    n_rows = len(p_data)
    sample_size = min(p_subsamplesize, n_rows)
    rand_vals = Random(p_seed).sample(range(n_rows), k=sample_size)
    subsample = p_data.iloc[rand_vals, :]

    # List of pairs along with p-value below threshold
    cont_cat_corr_list = []

    # Search for the significantly related pairs
    for i in cont_index:
        for j in cat_index:
            # Q() allows column names that are not valid identifiers; C()
            # makes sure numeric-coded categories enter as a factor rather
            # than as a continuous regressor.
            formula = 'Q("{}") ~ C(Q("{}"))'.format(p_data.columns[i],
                                                    p_data.columns[j])
            model_fit = ols(formula, data=subsample).fit()
            anova_model = anova_lm(model_fit)
            # first row of the table is the factor term
            p = anova_model["PR(>F)"].iloc[0]
            if p < p_p_val:
                # store p-value and columns index
                cont_cat_corr_list.append([p, i, j])

    # Order pairs by p-value, most significant first
    s_cont_cat_corr_list = sorted(cont_cat_corr_list,
                                  key=lambda entry: abs(entry[0]))

    cont_cat_corr_features = []

    # Print p-values and column names
    print('One-way ANOVA p-values - Predictors')
    for v, i, j in s_cont_cat_corr_list:
        cont_cat_corr_features.append(
            [p_data.columns[i], p_data.columns[j], v])
        print('{} and {} = {:.2}'.format(p_data.columns[i],
                                         p_data.columns[j], v))

    # Box plot of the significant pairs
    for v, i, j in s_cont_cat_corr_list:
        fg, ax = plt.subplots(figsize=(12, 8))
        p_data.boxplot(p_data.columns[i], p_data.columns[j], ax=ax,
                       grid=False)
        plt.xticks(rotation=90)
        plt.show()

    return cont_cat_corr_features
def neuron_period_activity_analysis(
        hp,
        log,
        trial_list,
        model_dir,
        rule,
        seltive_epoch,
        analy_epoch,
        n_types=('exh_neurons', 'mix_neurons'),
        norm=True,
        PSTH_log=None,
        last_step=True,
        bin_wid=0.5,
):
    """Compare mean activity in an analysis window across maturation stages.

    Trials are sorted into the stages 'early', 'mid' and 'mature', the mean
    activity of every row of ``PSTH_log[trial]`` inside the analysis window is
    collected per stage, a one-way ANOVA (stage as factor) is run on those
    values, and a histogram plus box plot is saved as PNG and PDF under
    ``figure/figure_<model dir name>/<rule>/<seltive_epoch>/<n_types>/``.

    Args:
        hp: dict-like; reads 'dt', 'mid_target_perf', 'early_target_perf'.
        log: dict-like; reads 'perf_<rule>' and 'trials'.
        trial_list: either a list of trial numbers (stage decided from the
            logged performance) or a dict ``{rule: {stage: [trials]}}``
            (stage decided from membership in 'mature'/'mid'/'early').
        model_dir: directory containing ``task_info.pkl``.
        rule: task rule name.
        seltive_epoch: epoch name, used for ``gen_PSTH_log`` and the path.
        analy_epoch: epoch name looked up in the task info, or a
            (start, end) pair that is divided by ``hp['dt']``.
        n_types, norm: forwarded to ``gen_PSTH_log``.
        PSTH_log: precomputed log; generated when ``None``.
        last_step: keep only the last trial of every stage.
        bin_wid: histogram bin width.

    Raises:
        ValueError: for an unsupported ``analy_epoch`` or ``trial_list`` type.
    """
    # local import: only needed to create the output directory
    import os

    print("Start neuron period activity analysis")
    print(
        "\trule: " + rule + " selective epoch: " + seltive_epoch +
        " analysis epoch: ", analy_epoch)

    with open(model_dir + '/task_info.pkl', 'rb') as tinf:
        task_info = pickle.load(tinf)

    if PSTH_log is None:
        PSTH_log = gen_PSTH_log(hp,
                                trial_list,
                                model_dir,
                                rule,
                                seltive_epoch,
                                n_types=n_types,
                                norm=norm)

    # analysis window as column indices, plus the tag used in file names
    if isinstance(analy_epoch, str):
        start = task_info[rule]['epoch_info'][analy_epoch][0]
        end = task_info[rule]['epoch_info'][analy_epoch][1]
        epoch_tag = analy_epoch
    elif isinstance(analy_epoch, (tuple, list)):
        start = int(analy_epoch[0] / hp["dt"])
        end = int(analy_epoch[1] / hp["dt"])
        epoch_tag = str(analy_epoch[0]) + '_' + str(analy_epoch[1])
    else:
        raise ValueError('Wrong analy_epoch format!')

    is_dict = isinstance(trial_list, dict)
    is_list = isinstance(trial_list, list)
    if is_dict:
        temp_list = list()
        for value in trial_list[rule].values():
            temp_list += value
        temp_list = sorted(set(temp_list))
    elif is_list:
        temp_list = trial_list
    else:
        # previously this fell through to an undefined `temp_list`
        raise ValueError('Wrong trial_list format!')

    trial_sort_by_matur = dict()
    fire_rate_dict = dict()

    for trial_num in temp_list:
        growth = log['perf_' + rule][trial_num // log['trials'][1]]

        if (is_list and growth > hp['mid_target_perf']) or (
                is_dict and trial_num in trial_list[rule]['mature']):
            mature = 'mature'
        elif (is_list and growth > hp['early_target_perf']) or (
                is_dict and trial_num in trial_list[rule]['mid']):
            mature = 'mid'
        elif is_list or (is_dict
                         and trial_num in trial_list[rule]['early']):
            mature = 'early'
        else:
            # A trial that belongs to none of the three stages used to
            # inherit the label of the previous trial (or fail on the first
            # one); leave it out instead.
            continue

        if mature not in trial_sort_by_matur:
            trial_sort_by_matur[mature] = list()
            fire_rate_dict[mature] = list()
        trial_sort_by_matur[mature].append(trial_num)

    if last_step:
        for key, value in trial_sort_by_matur.items():
            trial_sort_by_matur[key] = value[-1:]

    # mean activity of every row inside the analysis window
    for mature_key, sub_trial_list in trial_sort_by_matur.items():
        for trial_num in sub_trial_list:
            fire_rate_dict[mature_key] += list(
                PSTH_log[trial_num][:, start:end].mean(axis=1))

    # ANOVA #
    #f,p = stats.f_oneway(*list(fire_rate_dict.values()))
    # long format: one row per value, labelled with its stage
    dict_melt = {'Maturation': list(), 'Fire_rate': list()}
    for key, value in fire_rate_dict.items():
        dict_melt['Maturation'] += [key] * len(value)
        dict_melt['Fire_rate'] += list(value)
    df_melt = pd.DataFrame(dict_melt)
    model = ols('Fire_rate~C(Maturation)', data=df_melt).fit()
    anova_table = anova_lm(model, typ=2)
    # row 0 is the factor, row 1 the residual; select by position explicitly
    # (plain [0] on a label-indexed Series is deprecated in pandas)
    p = anova_table['PR(>F)'].iloc[0]
    df_g = anova_table['df'].iloc[0]
    df_res = anova_table['df'].iloc[1]
    #print("\tP value:",anova_table['PR(>F)'][0])

    # plot #
    colors = {'early': 'green', 'mid': 'blue', 'mature': 'red'}

    save_path = 'figure/figure_' + model_dir.rstrip('/').split('/')[
        -1] + '/' + rule + '/' + seltive_epoch + '/' + '_'.join(n_types) + '/'
    # make sure the target directory exists before saving into it
    os.makedirs(save_path, exist_ok=True)

    fig, axes = plt.subplots(2, 1, figsize=(12, 15))

    for mature, fire_rate in fire_rate_dict.items():
        axes[0].hist(fire_rate,
                     bins=int(max(fire_rate) / bin_wid) + 1,
                     histtype="step",
                     alpha=0.6,
                     color=colors[mature],
                     label=mature + ' mean:%.3f' % (np.mean(fire_rate)),
                     density=1)
    axes[0].legend()
    axes[0].set_xlabel("activity")

    m_keys = list(fire_rate_dict.keys())
    axes[1].boxplot([fire_rate_dict[m_key] for m_key in m_keys],
                    labels=m_keys)
    axes[1].set_ylabel("activity")

    fig.suptitle("rule: " + rule + " selective epoch: " + seltive_epoch +
                 " analysis epoch: " + str(analy_epoch) +
                 "\n p value: %.3e" % (p) + " group df: %.1f" % (df_g) +
                 " residual df: %.1f" % (df_res))

    file_stem = save_path + rule + '_' + epoch_tag + '_activity_oneway_anova_analysis'
    fig.savefig(file_stem + '.png')
    fig.savefig(file_stem + '.pdf')
    # release the figure; repeated calls would otherwise accumulate figures
    plt.close(fig)
, "FilterImg": FilterImg , "ResultImg": ResultImg } ) print(degf) ## Apply regression formula and draw result in 3d graphs Reg4 = ols(formula="ResultImg ~ NoisedImg + FilterImg", NoisedImg=degf) Fit4 = Reg4.fit() print(Fit4.summary()) print(Fit4.params) print(Fit4.fittedvalues) print(Fit4.resid) print(Fit4.bse) print(Fit4.centered_tss) print(anova_lm(Fit4)) fg = plt.figure() ax = fg.add_subplot(111, projection="3d") ax.scatter( degf["NoisedImg"] , degf["FilterImg"] , degf["ResultImg"] , color="Blue" , marker="+" , alpha=0.5 ) ##Draw the axis for all values with alpha values changes ax.set_xlabel("NoisedImg") ax.set_ylabel("FilterImg") ax.set_zlabel("ResultImg") x_surf = numpy.arange(110, 700, 40)
continue genedata = pd.DataFrame({"expr": expr_vals, "SNP": snp_genotypes, "STR": str_genotypes}) genedata = genedata[~np.isnan(genedata["STR"]) & ~np.isnan(genedata["SNP"]) & ~np.isnan(genedata["expr"])] # Remove outlier STR genotypes gtcounts = genedata.groupby("STR", as_index=False).agg({"SNP": len}) keepgt = set(gtcounts[gtcounts["SNP"]>=args.mingt]["STR"]) genedata = genedata[genedata["STR"].apply(lambda x: x in keepgt)] # print("STR r: %s"%scipy.stats.pearsonr(genedata["STR"], genedata["expr"])[0]) # print("SNP r: %s"%scipy.stats.pearsonr(genedata["SNP"], genedata["expr"])[0]) # print(genedata.groupby("STR", as_index=False).agg({"SNP": len})) # Normalize genedata["STR"] = ZNorm(genedata["STR"]) genedata["SNP"] = ZNorm(genedata["SNP"]) genedata["expr"] = ZNorm(genedata["expr"]) formula_snpstr = "expr ~ STR+SNP" formula_snp = "expr ~ SNP" try: lm_snpstr = ols(formula_snpstr, genedata).fit() except: PROGRESS("Error running snpstr model for gene: %s"%gene) continue try: lm_snp = ols(formula_snp, genedata).fit() except: PROGRESS("Error running SNP only model for gene: %s"%gene) continue anova_results = anova_lm(lm_snp, lm_snpstr) pval = anova_results["Pr(>F)"].values[1] outitems = [gene, args.chrom+":"+str(str_pos), args.chrom+":"+str(snp_pos), pval] outf.write("\t".join([str(item) for item in outitems])+"\n")
data = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-1-2-3_accjudg.csv")
data.head()

from statsmodels.sandbox.stats.runs import mcnemar

crosstab = pd.crosstab(data['BEFORE'], data['AFTER'])
# `exact=False` selects the chi-square version of the test, which is what the
# printed label and `correction=False` refer to; with the default exact test
# the first return value is not a chi-square statistic.
x2, p = mcnemar(crosstab, exact=False, correction=False)
print('Chi-square=%1.2f, p = %1.2f'%(x2, p))

## Independent *t*-test
vowels = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/04-3-2-1_f1-freq.csv")
vowels.head()

t, p = stats.ttest_ind(vowels[vowels['SEX']=='M']['HZ_F1'], vowels[vowels['SEX']=='F']['HZ_F1'])
print("t-score=%1.2f, p=%1.2f"%(t,p))

## One-way ANOVA
data = pd.read_table(DEMO_DATA_ROOT + "/gries_sflwr/_inputfiles/05-2_reactiontimes.csv")
data

# rows with missing values are removed before fitting
data = data.dropna()

from statsmodels.formula.api import ols
from statsmodels.stats.anova import anova_lm

model = ols('RT ~ FAMILIARITY', data).fit()
aov = anova_lm(model)
print(aov)
]] display(frame1, frame2) # %% frame = pd.concat([frame1, frame2]) frame.index = range(len(frame)) frame['cv'] = frame['cv'].map(lambda e: str(e)) display(frame) # %% for value in ['f1score', 'recall', 'precision']: print('-' * 80) print(value) formula = f'{value} ~ subject + method + cv' model = ols(formula, data=frame).fit() anova = anova_lm(model) anova.to_html(f'anova_{value}.html') display(anova) # %% newframe = frame.groupby(['method', 'subject']).mean() newframe.pop('f1score') display(newframe) plt.style.use('ggplot') fig, axes = plt.subplots(1, 3, figsize=(12, 4)) for j, value in enumerate(['recall', 'precision', 'f1-score']): ax = axes[j] ax.boxplot([newframe.loc['raw'][value], newframe.loc['manifolder'][value]],
import os
import pandas as pd
import statsmodels.formula.api as smf
from statsmodels.stats.anova import anova_lm
from scipy.stats import shapiro
from variables import DIR_OUT, DIR_JAM
import numpy as np

if __name__ == "__main__":
    df = pd.read_csv(
        os.path.join(DIR_OUT, "derived_tables",
                     "nb_streamlines_hemi_level.csv"))
    df["PP_CS_Depth_Normalised"] = df["PP_CS_Depth"] / df["Max_Geo_Depth"]
    # df['Roi_Area_Normalised']
    # df.to_csv(os.path.join(DIR_OUT,'derived_tables','nb_streamlines_hemi_level_norm.csv'), index=False)
    # model = smf.ols('Nb_Streamlines_Hemi ~ PP_CS_Depth_Normalised + C(Hemisphere) + C(HandednessQ) + Roi_Area',data=df).fit()
    model = smf.ols("PP_CS_Depth_Normalised ~ C(Hemisphere)", data=df).fit()

    # Shapiro-Wilk normality test on the model residuals
    resid = model.resid
    W, p = shapiro(resid)
    # single-argument print calls behave the same under Python 2 and 3
    print("%s %s" % (W, p))

    summary = model.summary()
    print(summary)

    # The keyword is `typ`; the misspelt `type=3` was silently ignored and a
    # type 1 table was produced instead of the requested type 3 one.
    anova = anova_lm(model, typ=3)
    print(anova)
from statsmodels.stats.anova import anova_lm
from statsmodels.formula.api import ols
from pandas import Series, DataFrame
import numpy as np
import statsmodels.api as sm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

hs = [1.00, 2.00, 3.00, 4.00, 5.00, 6.00, 7.00, 8.00, 9.00, 2.00]
fetus = [1.00, 2.00, 1.00, 2.00, 1.00, 2.00, 2.00, 2.00, 1.00, 1.00]
observer = [1.00, 2.00, 3.00, 2.00, 1.00, 2.00, 4.00, 4.00, 1.00, 1.00]

# Build the frame from the three lists the formulas below refer to; the
# previous dict used the names `value` and `group`, which are not defined in
# this script.
data = {'hs': hs, 'fetus': fetus, 'observer': observer}
df = DataFrame(data)
# print frame
import json

# single-argument print calls behave the same under Python 2 and 3
formula = 'hs ~ fetus'
anova_results = anova_lm(ols(formula, df).fit())
print(anova_results)

# both factors plus their interaction
formula = 'hs~C(fetus)+C(observer)+C(fetus):C(observer)'
anova_results = anova_lm(ols(formula, df).fit())
print(anova_results)

hsd = pairwise_tukeyhsd(hs, fetus)
print(hsd.summary())

valsDict_rs = {}
# rowKey = colKey, codeID = cols[5], organID = cols[1], examID = cols[6], examTime = cols[2].replace("D", "")
valsDict_rs['rowKey'] = "rowKey6"
valsDict_rs['codeID'] = "codeID4"
valsDict_rs['organID'] = "organID3"
plt.savefig(PATH + 'grid_of_age.png', dpi=300)
plt.close()

# ANOVA: nested polynomial models of wage on age (degrees 1, 2, 3 and 5)
X1, X2, X3, X5 = (
    PolynomialFeatures(degree=order).fit_transform(wage[['age']])
    for order in (1, 2, 3, 5)
)
poly1, poly2, poly3, poly5 = (
    sm.GLS(wage['wage'], design).fit() for design in (X1, X2, X3, X5)
)

# ANOVA, as in the chapter 3 notebook
# NOTE(review): `poly4`, `X4` and `X_test` are not defined in this span —
# presumably earlier in the script; confirm.
with warnings.catch_warnings():
    # Suppress warnings
    warnings.filterwarnings("ignore")
    comparison = anova_lm(poly1, poly2, poly3, poly4, poly5)
    print(comparison)

# Polynomial regression of degree 5 on orthogonalized X. Refer to chapter 3 notebook.
X5_ortho = ortho_poly_fit(wage[['age']], degree=5)[0]
# Replace constant column with 1s for Intercept estimation.
X5_ortho[:, 0] = 1
poly5_ortho = sm.GLS(wage['wage'], X5_ortho).fit()
print(poly5_ortho.summary())

# Create binary qualitative response: 1 when wage exceeds 250, else 0
high_earner = wage.wage > 250
y_clf = high_earner.map({False: 0, True: 1})

# Logistic regression
logreg = sm.GLM(y_clf, X4, family=sm.families.Binomial()).fit()
print(logreg.summary())

# Predict on age grid
y_pred_clf = logreg.predict(X_test)
from statsmodels.stats.anova import anova_lm
from statsmodels.stats.multicomp import pairwise_tukeyhsd

# One-way analysis of variance
data = pd.read_excel(unicode(r'C:\Users\mime\Desktop\统计学学习数据.xlsx', 'utf-8'), sheetname=0)

# Reshape the sheet (one column per group) into long format: one row per
# observation with its group label and its value.
list_value = []
list_variable = []
for i in range(len(data.columns)):
    x = data.iloc[:, i]
    for value in x:
        list_value.append(value)
        list_variable.append(x.name)
data = pd.DataFrame([list_variable, list_value], index=['indestry', 'Y']).T
# the transposed frame holds objects; the response has to be numeric
data['Y'] = pd.to_numeric(data['Y'])

formula = 'Y ~ C(indestry)'
anova_results = anova_lm(ols(formula, data).fit())

# The same ANOVA by hand. `data` is in long format at this point, so the
# group statistics are taken per `indestry` level (the earlier row-wise
# arithmetic assumed one row per group).
valid = data.dropna(subset=['Y'])
grouped = valid.groupby('indestry')['Y']
mean_data = grouped.mean()            # group means
group_sizes = grouped.count()         # observations per group
k = len(group_sizes)                  # number of groups
n = group_sizes.sum()                 # total number of observations
mean_all = valid['Y'].sum() / n       # grand mean
SST = ((valid['Y'] - mean_all)**2).sum()                        # total sum of squares
SSA = float((((mean_data - mean_all)**2) * group_sizes).sum())  # between groups
SSE = ((valid['Y'] - grouped.transform('mean'))**2).sum()       # within groups
F = (SSA / (k - 1)) / (SSE / (n - k))
# p-value is the upper tail of the F distribution (`sf`), not the `cdf`
F_pval = st.f.sf(F, k - 1, n - k)
F_alpha = st.f.ppf(1 - 0.05, k - 1, n - k)
R_square = SSA / SST
R = mt.sqrt(SSA / SST)
#%% sns.boxplot(x="Typ", y="Druckfestigkeit", data=df) plt.xlabel("Typ") plt.ylabel("Druckfestigkeit") plt.show() #%% b) fit = ols("Druckfestigkeit~Typ", data=df).fit() fit.params #%% c) # H_0 = mu_1 = mu_2 = mu_3 = mu_4 from statsmodels.stats.anova import anova_lm anova_lm(fit) #%% 10.2 from pandas import DataFrame import pandas as pd import numpy as np import seaborn as sns import matplotlib.pyplot as plt import scipy.stats as st df = DataFrame({ "Behandlung": np.repeat(["A", "B", "C", "D"], [4, 6, 6, 8]), "Koagulationszeit": [ 62, 60, 63, 59, 63, 67, 71, 64, 65, 66, 68, 66, 71, 67, 68, 68, 56, 62, 60, 61, 63, 64, 63, 59
def two_way_anova(pdf, var_name, grouping_names):
    """Two-way ANOVA

    Fits ``var_name ~ C(a) + C(b) + C(a):C(b)`` for the two grouping
    variables and reports both main effects and the interaction.

    Arguments:
    pdf (pd dataframe): data; rows with a missing value in var_name or in any
        of the grouping variables are dropped first
    var_name (str): name of the dependent variable
    grouping_names (list of str): names of the two grouping variables

    Returns:
    str: text with the F test of each main effect and of the interaction
    """
    # TODO extend it to multi-way ANOVA
    text_result = ''

    # http://statsmodels.sourceforge.net/stable/examples/generated/example_interactions.html#one-way-anova
    from statsmodels.formula.api import ols
    from statsmodels.stats.anova import anova_lm

    data = pdf.dropna(subset=[var_name] + grouping_names)
    # from IPython import embed; embed()
    # FIXME If there is a variable called 'C', then patsy is confused whether C is the variable or the categorical variable
    # http://gotoanswer.stanford.edu/?q=Statsmodels+Categorical+Data+from+Formula+%28using+pandas%
    # http://stackoverflow.com/questions/22545242/statsmodels-categorical-data-from-formula-using-pandas
    # http://stackoverflow.com/questions/26214409/ipython-notebook-and-patsy-categorical-variable-formula
    factor_a, factor_b = grouping_names[0], grouping_names[1]
    anova_model = ols(str('%s ~ C(%s) + C(%s) + C(%s):C(%s)' %
                          (var_name, factor_a, factor_b, factor_a, factor_b)),
                      data=data).fit()
    anova_result = anova_lm(anova_model, typ=3)

    # Rows are addressed by position (1 and 2: main effects, 3: interaction,
    # 4: residual) through .iloc, because plain [n] on these label-indexed
    # columns is deprecated in pandas.
    dfs = anova_result['df']
    f_values = anova_result['F']
    p_values = anova_result['PR(>F)']
    residual_df = dfs.iloc[4]

    text_result += _('Result of two-way ANOVA:' + '\n')
    # Main effects (only the two factors that are part of the model)
    for group_i, group in enumerate((factor_a, factor_b)):
        row = group_i + 1
        # translate the template first, then insert the name, as is done for
        # the interaction line below
        text_result += _('Main effect of %s: ') % group + '<i>F</i>(%d, %d) = %0.3g, %s\n' % \
                       (dfs.iloc[row], residual_df, f_values.iloc[row],
                        cs_util.print_p(p_values.iloc[row]))
    # Interaction effects
    text_result += _('Interaction of %s and %s: ') % (factor_a, factor_b) + '<i>F</i>(%d, %d) = %0.3g, %s\n' % \
                   (dfs.iloc[3], residual_df, f_values.iloc[3],
                    cs_util.print_p(p_values.iloc[3]))

    # TODO effect size
    # http://en.wikipedia.org/wiki/Effect_size#Omega-squared.2C_.CF.892
    # omega2 = (anova_result['sum_sq'][0] - (anova_result['df'][0] * anova_result['mean_sq'][1])) / (
    #     (anova_result['sum_sq'][0] + anova_result['sum_sq'][1]) + anova_result['mean_sq'][1])
    # text_result += _('Effect size: ') + 'ω<sup>2</sup> = %0.3g\n' % omega2

    # TODO post-hoc test
    # http://statsmodels.sourceforge.net/stable/stats.html#multiple-tests-and-multiple-comparison-procedures
    # if anova_result['PR(>F)'][0] < 0.05:  # post-hoc
    #     post_hoc_res = sm.stats.multicomp.pairwise_tukeyhsd(np.array(data[var_name]), np.array(data[grouping_name]),
    #                                                         alpha=0.05)
    #     text_result += '\n' + _(u'Groups differ. Post-hoc test of the means.') + '\n'
    #     text_result += ('<fix_width_font>%s\n<default>' % post_hoc_res).replace(' ', u'\\u00a0')
    #     # TODO create our own output
    #     # http://statsmodels.sourceforge.net/devel/generated/statsmodels.sandbox.stats.multicomp.TukeyHSDResults.html#statsmodels.sandbox.stats.multicomp.TukeyHSDResults
    #     # These are the original data:
    #     #     post_hoc_res.data
    #     #     post_hoc_res.groups
    #     # These are used for the current output:
    #     #     post_hoc_res.groupsunique
    #     #     post_hoc_res.meandiffs
    #     #     post_hoc_res.confint
    #     #     post_hoc_res.reject
    return text_result
# Visualise the distribution of the data in each group
#plot_data = [gr1,gr2,gr3]
#plt.boxplot(plot_data)
#plt.show()

f_statistic, p_val = stats.f_oneway(gr1, gr2, gr3)
oneway_values = (f_statistic, p_val)
print('일원분산분석 결과 : f_statistic:%f , p_val:%f' % oneway_values)
# Recorded one-way ANOVA result: f_statistic 3.711336, p_val 0.043589 < 0.05,
# so the null hypothesis is rejected: the claim that the test scores of the
# three groups differ is statistically significant.

# One-way ANOVA, method 2 - via a linear model
df = pd.DataFrame(data, columns=['value', 'group'])
#print(df)
# C(group column) marks the column explicitly as categorical;
# PR(>F) is the p-value (0.043589)
lmodel = ols('value ~ C(group)', df).fit()
print(anova_lm(lmodel))

# Two-way ANOVA: two grouping factors
url = 'https://raw.githubusercontent.com/pykwon/python/master/testdata_utf8/group3_2.txt'
data = pd.read_csv(url)
print(data.head(3))
print(data.tail(3))

# H0: head circumference of the fetus does not differ by observer and
#     number-of-fetuses group.
# H1: head circumference of the fetus differs by observer and
#     number-of-fetuses group.

# Visualisation
plt.rc('font', family='malgun gothic')
data.boxplot(column='머리둘레', by='태아수', grid=True)
#plt.show()
# Head circumference appears to differ; whether there is an interaction with
# the observer is tested with the analysis of variance.
#axes[0].yaxis.tick_right() for ax in axes.flat: ax.margins(0.00) ax.grid(True) fig.tight_layout(rect=[0, 0.01, 1, 0.97]) fig.subplots_adjust(wspace=0.0) title = '{0} Conv2D Layer {1}'.format(name, layer_id + 1) plt.suptitle(title, x=0.55, y=1.0) if not os.path.exists('./feat_plots'): os.mkdir('feat_plots') plt.savefig('./feat_plots/{0}.png'.format(title)) #fig.savefig("foo.pdf", bbox_inches='tight') plt.clf() anova_all = np.array(anova_all) df = pd.DataFrame(data=anova_all[1:, 1:], index=anova_all[1:, 0].tolist(), columns=anova_all[0, 1:].tolist()) df.colums = ['id', 'y', 'layer_id', 'is_sparse'] df = df.astype({'y': 'float32', 'layer_id': 'int32', 'is_sparse': 'int32'}) formula = 'y ~ C(layer_id) + C(is_sparse) + C(layer_id)*C(is_sparse)' model = ols(formula, df).fit() aov_table = anova_lm(model, typ=1) eta_squared(aov_table) omega_squared(aov_table) print(aov_table)
resid[group.index], marker=symbols[j], color=colors[i - 1], s=144, edgecolors='black') plt.xlabel('Group') #@savefig residual_groups.png align=center plt.ylabel('Residuals') # now we will test some interactions using anova or f_test interX_lm = ols("S ~ C(E) * X + C(M)", salary_table).fit() print interX_lm.summary() # Do an ANOVA check table1 = anova_lm(lm, interX_lm) print table1 interM_lm = ols("S ~ X + C(E)*C(M)", data=salary_table).fit() print interM_lm.summary() table2 = anova_lm(lm, interM_lm) print table2 # The design matrix as a DataFrame interM_lm.model.data.orig_exog # The design matrix as an ndarray interM_lm.model.exog interM_lm.model.exog_names infl = interM_lm.get_influence()
# 3. Levene's test for equality of variances
(W, p) = stats.levene(group1, group2, group3)
if p < 0.05:
    print('Warning: the P-value of the Levene test is <0.05: p=', p)

# 4. One-way ANOVA
## 4.1. with scipy
F_statistic, pVal = stats.f_oneway(group1, group2, group3)
print((F_statistic, pVal))
if pVal < 0.05:
    print('One of the groups is significantly different.')

## 4.2. with a linear model
# Give the formula a frame with named columns; it was built only further down,
# after the model had already been fitted on `data` itself.
df = pd.DataFrame(data, columns=["value", "treatment"])
model = ols('value ~ C(treatment)', df).fit()
anovaResults = anova_lm(model)
print(anovaResults)

## 4.3. compare both results (no output when they agree)
# first row of the table is the treatment term; selected by position
# explicitly because plain [0] on a label-indexed Series is deprecated
np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

# Post-hoc test
# 1. library imports
import matplotlib.pyplot as plt
from statsmodels.stats.multicomp import pairwise_tukeyhsd
# 2. data
plot_data = [group1, group2, group3]
# 3. Tukey post-hoc test
data_frame["X1"] , data_frame["X2"] , data_frame["Y"] , color="blue" , marker="o" , alpha=1 ) # From here....graph code is taken from class mate "Arsalan Ali" Reg = ols(formula="Y ~ X! + X2", data=data_frame) Fit2 = Reg.fit() print("\n", Fit2.summary()) print("\n", anova_lm(Fit2)) # Again plotting our Dear 3D-Graph ax = plt.figure().gca(projection='3d') # Creating Axis X,Y,Z out of our Data Y,X1,X2 ax.scatter( data_frame["X1"] , data_frame["X2"] , data_frame["y"] , color="blue" , marker="o" , alpha=1 ) # Title of the graph
随机性:样本是随机采样但 独立性:来自不同组但样本是相互独立但 正太分布性:组内样本都来自一个正太分布 方差齐性:不同组但方差相等或相近 """ # 读取数据, d1 对应于算法 a,d2 对应于算法 b df = pd.read_csv("./oneway.csv") d1 = df[df['algo'] == 'a']['ratio'] d2 = df[df['algo'] == 'b']['ratio'] # 检验两个水平的正态性 print('---------------- 检验两个水平的正态性 ----------------') print(ss.normaltest(d1)) print(ss.normaltest(d2)) # 检测两个水平的方差齐性 print('---------------- 检测两个水平的方差齐性 ----------------') args = [d1, d2] print(ss.levene(*args)) # F 检验的第一种方法 print('---------------- F 检验的第一种方法 ----------------') print(ss.f_oneway(*args)) # F 检验的第二种方法 print('---------------- F 检验的第二种方法 ----------------') model = ols('ratio ~ algo', df).fit() anovat = anova_lm(model) print(anovat)