def paired_data():
    """Analyze paired data with a paired t-test and a Wilcoxon signed-rank test.

    Compares the mean daily energy intake (in kJ) over 10 pre-menstrual and
    10 post-menstrual days for 11 women (data file ``altman_93.txt``).
    Both p-values are printed.

    Returns
    -------
    p_value : float
        p-value of the Wilcoxon signed-rank test on the paired differences.
        The p-value of the paired t-test is printed but not returned.
    """
    # Get the data: daily intake of energy in kJ for 11 women
    data = getData("altman_93.txt", subDir=r"..\Data\data_altman")

    pre = data[:, 0]
    post = data[:, 1]
    # Both tests operate on the within-subject differences.
    difference = post - pre

    # --- >>> START stats <<< ---
    # paired t-test: doing two measurements on the same experimental unit
    # e.g., before and after a treatment. This is equivalent to a one-sample
    # t-test of the differences against 0.
    _, p_value = stats.ttest_1samp(difference, 0)

    # p < 0.05 => alternative hypothesis:
    # the difference in mean is not equal to 0
    print(("paired t-test", p_value))

    # alternative to paired t-test when data has an ordinal scale or when not
    # normally distributed
    _, p_value = stats.wilcoxon(difference)
    # --- >>> STOP stats <<< ---
    print(("Wilcoxon-Signed-Rank-Sum test", p_value))

    return p_value  # should be 0.0033300139117459797
def correlation():
    """Compute Pearson, Spearman and Kendall correlation coefficients.

    Compares age and %fat (measured by dual-photon absorptiometry) for
    18 normal adults (data file ``altman_11_1.txt``). All three coefficients
    are printed, and Spearman's rho is cross-checked against the Pearson
    correlation of the rank-transformed data.

    Returns
    -------
    float
        The Pearson correlation coefficient.

    Raises
    ------
    AssertionError
        If Spearman's rho does not match the Pearson correlation of the ranks.
    """
    # Get the data.
    # Raw string: in a normal literal "\D" and "\d" are invalid escape
    # sequences (SyntaxWarning on recent Python versions).
    data = getData('altman_11_1.txt', subDir=r'..\Data\data_altman')
    x = data[:, 0]
    y = data[:, 1]

    # --- >>> START stats <<< ---
    # Calculate correlations
    # Resulting correlation values are stored in a dictionary, so that it is
    # obvious which value belongs to which correlation coefficient.
    corr = {}
    corr['pearson'], _ = stats.pearsonr(x, y)
    corr['spearman'], _ = stats.spearmanr(x, y)
    corr['kendall'], _ = stats.kendalltau(x, y)
    # --- >>> STOP stats <<< ---

    print(corr)

    # Assert that Spearman's rho is just the correlation of the rank-sorted data
    rank_pearson = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]
    np.testing.assert_almost_equal(corr['spearman'], rank_pearson)

    return corr['pearson']  # should be 0.79208623217849117
def anova_byHand():
    """Calculate a one-way ANOVA by hand.

    While you would normally not do that, this function shows how the
    underlying values (sums of squares, degrees of freedom, F and p) can be
    calculated. Uses the data file ``altman_910.txt``; the result is printed.

    Returns
    -------
    tuple
        ``(F, p)``: the F statistic and the corresponding p-value taken from
        the survival function of the F distribution.
    """
    # Get the data.
    # Raw string: in a normal literal "\D" and "\d" are invalid escape sequences.
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Convert them to pandas-format and group them by their group value
    df = pd.DataFrame(data, columns=['values', 'group'])
    groups = df.groupby('group')
    grand_mean = df['values'].mean()

    # The "total sum-square" is the squared deviation from the grand mean.
    # It equals ss_treatments + ss_error and is shown for illustration only;
    # it is not needed for the F statistic.
    ss_total = np.sum((df['values'] - grand_mean)**2)  # noqa: F841

    # Calculate ss_treatments (between groups) and ss_error (within groups)
    ss_treatments = 0
    ss_error = 0
    for _, group in groups:
        group_mean = group['values'].mean()
        ss_error += np.sum((group['values'] - group_mean)**2)
        ss_treatments += len(group) * (group_mean - grand_mean)**2

    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)

    # Use a separate name for the distribution so the DataFrame `df`
    # is not silently rebound to a different kind of object.
    f_dist = stats.f(df_groups, df_residuals)
    p = f_dist.sf(F)

    print(('ANOVA-Results: F = {0}, and p<{1}'.format(F, p)))

    return (F, p)
def unpaired_data():
    """Compare two independent groups with a t-test and a Mann-Whitney test.

    Unpaired comparison of the 24 hour total energy expenditure (MJ/day) in
    groups of lean and obese women (data file ``altman_94.txt``). Both
    p-values are printed and the raw data of the two groups are plotted.

    Returns
    -------
    p_value : float
        p-value of the Mann-Whitney U test. The p-value of the two-sample
        t-test is printed but not returned.
    """
    # Get the data: energy expenditure in MJ and stature (0=obese, 1=lean)
    energ = getData("altman_94.txt", subDir=r"..\Data\data_altman")

    # Group them: column 1 holds the group label, column 0 the measurement
    group1 = energ[energ[:, 1] == 0][:, 0]
    group2 = energ[energ[:, 1] == 1][:, 0]

    # --- >>> START stats <<< ---
    # two-sample t-test
    # null hypothesis: the two groups have the same mean
    # this test assumes the two groups have the same variance...
    # (can be checked with tests for equal variance)
    # independent groups: e.g., how boys and girls fare at an exam
    # dependent groups: e.g., how the same class fare at 2 different exams
    _, p_value = stats.ttest_ind(group1, group2)

    # p_value < 0.05 => alternative hypothesis:
    # they don't have the same mean at the 5% significance level
    print(("two-sample t-test", p_value))

    # For non-normally distributed data, perform the two-sample wilcoxon test
    # a.k.a Mann Whitney U
    _, p_value = stats.mannwhitneyu(group1, group2)
    print(("Mann-Whitney test", p_value))
    # --- >>> STOP stats <<< ---

    # Plot the data. Successive plot calls draw into the same axes by
    # default; the former plt.hold(True) call no longer exists in Matplotlib
    # (removed in 3.0) and is not needed.
    plt.plot(group1, "bx", label="obese")
    plt.plot(group2, "ro", label="lean")
    plt.legend(loc=0)
    plt.show()

    # The same calculations, but implemented with pandas, would be:
    # import pandas as pd
    # df = pd.DataFrame(energ, columns = ['energy', 'weightClass'])
    # grouped = df.groupby('weightClass')
    # grouped.mean()
    # t_statistic, p_value = stats.ttest_ind(grouped.get_group(0).energy, grouped.get_group(1).energy)
    # grouped.energy.plot(marker='o', lw=0)
    # plt.legend(['obese', 'lean'])
    # plt.show()

    return p_value  # should be 0.0010608066929400244
def anova_oneway():
    """One-way ANOVA: test if results from 3 groups are equal.

    Twenty-two patients undergoing cardiac bypass surgery were randomized to
    one of three ventilation groups:

    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture
        continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture
        only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen
        for 24 h.

    The data show red cell folate levels for the three groups after 24h'
    ventilation.

    The ANOVA is computed twice, with scipy and with statsmodels, and the two
    F statistics are asserted to agree.

    Returns
    -------
    tuple
        ``(F_statistic, pVal)`` from ``scipy.stats.f_oneway``.

    Raises
    ------
    AssertionError
        If the scipy and statsmodels F statistics differ.
    """
    # Get the data
    print('One-way ANOVA: -----------------')
    # Raw string: in a normal literal "\D" and "\d" are invalid escape sequences.
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (W, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print(('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p)))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # The ANOVA table is indexed by term name, so the first row has to be
    # selected by position with .iloc; integer keys on a label-indexed
    # Series are deprecated in pandas.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def regression_line():
    """Fit a regression line with the "ordinary least squares" method.

    Data from 24 type 1 diabetic patients, relating fasting blood glucose
    (mmol/l) to mean circumferential shortening velocity (%/sec), derived
    from echocardiography (data file ``altman_11_6.txt``). The model summary
    is printed.

    Returns
    -------
    float
        F statistic of the fitted model ``Vcf ~ glucose``.
    """
    # Get the data
    data = getData('altman_11_6.txt', subDir=r'..\Data\data_altman')

    df = pd.DataFrame(data, columns=['glucose', 'Vcf'])

    # --- >>> START stats <<< ---
    # pandas.ols has been removed from pandas, so the fit uses the statsmodels
    # formula interface instead (the same `ols` name the ANOVA functions in
    # this file rely on). The formula adds an intercept automatically.
    model = ols('Vcf ~ glucose', df).fit()
    print(model.summary())
    # --- >>> STOP stats <<< ---

    return model.fvalue  # should be 4.4140184331462571
def anova_interaction():
    """ANOVA with interaction.

    Measurement of fetal head circumference, by four observers in three
    fetuses, from a study investigating the reproducibility of ultrasonic
    fetal head circumference data (data file ``altman_12_6.txt``).
    The ANOVA table is printed.

    Returns
    -------
    float
        F value in the first row of the ANOVA table.
    """
    # NOTE(review): a function with this same name is defined again further
    # down in this file; the later definition replaces this one at import.

    # Get the data.
    # Raw string: in a normal literal "\D" and "\d" are invalid escape sequences.
    data = getData("altman_12_6.txt", subDir=r"..\Data\data_altman")

    # Bring them in DataFrame-format
    df = pd.DataFrame(data, columns=["hs", "fetus", "observer"])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = "hs ~ C(fetus) + C(observer) + C(fetus):C(observer)"
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # The table is indexed by term name, so pick the first row by position;
    # integer keys on a label-indexed Series are deprecated in pandas.
    return anovaResults["F"].iloc[0]
def anova_interaction():
    """ANOVA with interaction.

    Measurement of fetal head circumference, by four observers in three
    fetuses, from a study investigating the reproducibility of ultrasonic
    fetal head circumference data (data file ``altman_12_6.txt``).
    The ANOVA table is printed.

    Returns
    -------
    float
        F value in the first row of the ANOVA table.
    """
    # NOTE(review): this re-defines a function of the same name that appears
    # earlier in this file; only this later definition is effective.

    # Get the data.
    # Raw string: in a normal literal "\D" and "\d" are invalid escape sequences.
    data = getData('altman_12_6.txt', subDir=r'..\Data\data_altman')

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # The table is indexed by term name, so pick the first row by position;
    # integer keys on a label-indexed Series are deprecated in pandas.
    return anovaResults['F'].iloc[0]
def check_mean():
    """Check a sample mean for a significant difference from a reference value.

    Data from Altman: compare the average daily energy intake (kJ) over
    10 days of 11 healthy women to the recommended level of 7725 kJ
    (data file ``altman_91.txt``). Prints the mean, the sample standard
    deviation, the 95% confidence interval of the mean, and the outcome of a
    one-sample t-test and a Wilcoxon signed-rank test.

    Returns
    -------
    prob : float
        p-value of the one-sample t-test against the reference value.
    """
    # Get data from Altman.
    # Raw string: in a normal literal "\D" and "\d" are invalid escape sequences.
    data = getData('altman_91.txt', subDir=r'..\Data\data_altman')

    # Watch out: by default the standard deviation in numpy is calculated
    # with ddof=0, corresponding to 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)     # sample standard deviation
    print(('Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD)))

    # Confidence intervals, based on the t-distribution with n-1 degrees
    # of freedom
    tf = stats.t(len(data) - 1)
    # multiplication with np.array[-1,1] is a neat trick to implement "+/-"
    ci = myMean + stats.sem(data) * np.array([-1, 1]) * tf.ppf(0.975)
    print(('The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1])))

    # Check if there is a significant difference relative to "checkValue"
    checkValue = 7725
    # --- >>> START stats <<< ---
    _, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print(('{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob)))

    # For not normally distributed data, use the Wilcoxon signed rank test
    _, pVal = stats.wilcoxon(data - checkValue)
    issignificant = 'unlikely' if pVal < 0.05 else 'likely'
    # --- >>> STOP stats <<< ---

    print(('It is ' + issignificant + ' that the value is {0:d}'.format(checkValue)))

    return prob  # should be 0.018137235176105802
def test_getdata(self):
    """Loading altman_93.txt must yield 5260 as the very first value."""
    loaded = getData('altman_93.txt', subDir='../Data/data_altman')
    first_value = loaded[0][0]
    self.assertEqual(first_value, 5260)