def paired_data():
    """Analyse paired data with a paired t-test and a Wilcoxon signed-rank test.

    Compares the mean daily energy intake (in kJ) over 10 pre-menstrual and
    10 post-menstrual days, read from the file "altman_93.txt".

    The p-values of both tests are printed.

    Returns
    -------
    float
        p-value of the Wilcoxon signed-rank test (the last test performed).
    """

    # Get the data: daily intake of energy in kJ for 11 women
    data = getData("altman_93.txt", subDir=r"..\Data\data_altman")

    # Column 0 is used as the "pre", column 1 as the "post" measurement
    pre = data[:, 0]
    post = data[:, 1]
    difference = post - pre

    # --- >>> START stats <<< ---
    # paired t-test: two measurements on the same experimental unit,
    # e.g., before and after a treatment. It is computed as a one-sample
    # t-test of the pairwise differences against 0.
    _, p_ttest = stats.ttest_1samp(difference, 0)

    # p < 0.05 => alternative hypothesis:
    # the difference in mean is not equal to 0
    print(("paired t-test", p_ttest))

    # alternative to the paired t-test when the data have an ordinal scale
    # or are not normally distributed
    _, p_wilcoxon = stats.wilcoxon(difference)
    # --- >>> STOP stats <<< ---
    print(("Wilcoxon-Signed-Rank-Sum test", p_wilcoxon))

    return p_wilcoxon  # should be 0.0033300139117459797
def correlation():
    """Pearson correlation, and two types of rank correlation (Spearman, Kendall).

    Compares age and %fat (measured by dual-photon absorptiometry) for
    18 normal adults, read from the file 'altman_11_1.txt'.

    All three coefficients are printed as a dictionary.

    Returns
    -------
    float
        The Pearson correlation coefficient.

    Raises
    ------
    AssertionError
        If Spearman's rho does not match the Pearson correlation of the
        rank-transformed data.
    """

    # Get the data. The path is a raw string because '\D' and '\d' are not
    # valid escape sequences in a normal string literal.
    data = getData('altman_11_1.txt', subDir=r'..\Data\data_altman')
    x = data[:, 0]
    y = data[:, 1]

    # --- >>> START stats <<< ---
    # Calculate correlations
    # Resulting correlation values are stored in a dictionary, so that it is
    # obvious which value belongs to which correlation coefficient.
    # Each SciPy function returns (coefficient, p-value); the p-value is dropped.
    corr = {}
    corr['pearson'], _ = stats.pearsonr(x, y)
    corr['spearman'], _ = stats.spearmanr(x, y)
    corr['kendall'], _ = stats.kendalltau(x, y)
    # --- >>> STOP stats <<< ---

    print(corr)

    # Assert that Spearman's rho is just the correlation of the rank-sorted data
    rank_pearson = stats.pearsonr(stats.rankdata(x), stats.rankdata(y))[0]
    np.testing.assert_almost_equal(corr['spearman'], rank_pearson)

    return corr['pearson']  # should be 0.79208623217849117
def anova_byHand():
    """Calculate a one-way ANOVA by hand.

    While you would normally not do that, this function shows how the
    underlying values can be calculated. The data are read from the file
    'altman_910.txt'; the first column holds the values, the second column
    the group each value belongs to.

    The result is printed.

    Returns
    -------
    tuple
        (F, p): the F statistic and the corresponding p-value.
    """

    # Get the data. The path is a raw string because '\D' and '\d' are not
    # valid escape sequences in a normal string literal.
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Convert them to pandas format and group them by their group value
    df = pd.DataFrame(data, columns=['values', 'group'])
    groups = df.groupby('group')
    grand_mean = df['values'].mean()

    # The total sum of squares, sum((x - grand_mean)**2), is split into a
    # part between the groups (ss_treatments) and a part within the groups
    # (ss_error).
    ss_treatments = 0
    ss_error = 0
    for _, group in groups:
        values = group['values']
        group_mean = values.mean()
        ss_error += sum((values - group_mean)**2)
        ss_treatments += len(group) * (group_mean - grand_mean)**2

    # Degrees of freedom: (number of groups - 1) and (observations - groups)
    df_groups = len(groups) - 1
    df_residuals = len(data) - len(groups)

    # F is the ratio of the two mean squares; p is the upper tail of the
    # corresponding F distribution.
    F = (ss_treatments / df_groups) / (ss_error / df_residuals)
    f_distribution = stats.f(df_groups, df_residuals)
    p = f_distribution.sf(F)

    print(('ANOVA-Results: F = {0}, and p<{1}'.format(F, p)))

    return (F, p)
def unpaired_data():
    """Compare two independent groups with a t-test and a Mann-Whitney U test.

    The data are the 24 hour total energy expenditure (MJ/day) in groups of
    lean and obese women, read from the file "altman_94.txt".

    The p-values of both tests are printed, and the values of the two groups
    are shown in a plot.

    Returns
    -------
    float
        p-value of the Mann-Whitney U test (the last test performed).
    """

    # Get the data: energy expenditure in MJ and stature (0=obese, 1=lean)
    energ = getData("altman_94.txt", subDir=r"..\Data\data_altman")

    # Group them: column 1 holds the group code, column 0 the energy value
    group1 = energ[energ[:, 1] == 0][:, 0]
    group2 = energ[energ[:, 1] == 1][:, 0]

    # --- >>> START stats <<< ---
    # two-sample t-test
    # null hypothesis: the two groups have the same mean
    # this test assumes the two groups have the same variance...
    # (can be checked with tests for equal variance)
    # independent groups: e.g., how boys and girls fare at an exam
    # dependent groups: e.g., how the same class fare at 2 different exams
    t_statistic, p_value = stats.ttest_ind(group1, group2)

    # p_value < 0.05 => alternative hypothesis:
    # they don't have the same mean at the 5% significance level
    print(("two-sample t-test", p_value))

    # For non-normally distributed data, perform the two-sample wilcoxon test
    # a.k.a Mann Whitney U
    u, p_value = stats.mannwhitneyu(group1, group2)
    print(("Mann-Whitney test", p_value))
    # --- >>> STOP stats <<< ---

    # Plot the data. Successive plot() calls draw into the same axes by
    # default, so no call to the removed plt.hold() is needed.
    plt.plot(group1, "bx", label="obese")
    plt.plot(group2, "ro", label="lean")
    plt.legend(loc=0)
    plt.show()

    # The same calculations, but implemented with pandas, would be:
    # import pandas as pd
    # df = pd.DataFrame(energ, columns = ['energy', 'weightClass'])
    # grouped = df.groupby('weightClass')
    # grouped.mean()
    # t_statistic, p_value = stats.ttest_ind(grouped.get_group(0).energy, grouped.get_group(1).energy)
    # grouped.energy.plot(marker='o', lw=0)
    # plt.legend(['obese', 'lean'])
    # plt.show()

    return p_value  # should be 0.0010608066929400244
def anova_oneway():
    """One-way ANOVA: test if results from 3 groups are equal.

    Twenty-two patients undergoing cardiac bypass surgery were randomized to one of three ventilation groups:

    Group I: Patients received 50% nitrous oxide and 50% oxygen mixture continuously for 24 h.
    Group II: Patients received a 50% nitrous oxide and 50% oxygen mixture only during the operation.
    Group III: Patients received no nitrous oxide but received 35-50% oxygen for 24 h.

    The data show red cell folate levels for the three groups after 24h' ventilation.

    The function first checks the equality of variances with the Levene test,
    then runs the one-way ANOVA with SciPy, and finally repeats the analysis
    with pandas & statsmodels and asserts that both F values agree.

    Returns
    -------
    tuple
        (F_statistic, pVal) of the SciPy one-way ANOVA.

    Raises
    ------
    AssertionError
        If the F value from SciPy and the one from statsmodels differ.
    """

    # Get the data. The path is a raw string because '\D' and '\d' are not
    # valid escape sequences in a normal string literal.
    print('One-way ANOVA: -----------------')
    data = getData('altman_910.txt', subDir=r'..\Data\data_altman')

    # Sort them into groups, according to column 1
    group1 = data[data[:, 1] == 1, 0]
    group2 = data[data[:, 1] == 2, 0]
    group3 = data[data[:, 1] == 3, 0]

    # --- >>> START stats <<< ---
    # First, check if the variances are equal, with the "Levene"-test
    (_, p) = stats.levene(group1, group2, group3)
    if p < 0.05:
        print(('Warning: the p-value of the Levene test is <0.05: p={0}'.format(p)))

    # Do the one-way ANOVA
    F_statistic, pVal = stats.f_oneway(group1, group2, group3)
    # --- >>> STOP stats <<< ---

    # Print the results
    print('Data form Altman 910:')
    print((F_statistic, pVal))
    if pVal < 0.05:
        print('One of the groups is significantly different.')

    # Elegant alternative implementation, with pandas & statsmodels
    df = pd.DataFrame(data, columns=['value', 'treatment'])
    model = ols('value ~ C(treatment)', df).fit()
    anovaResults = anova_lm(model)
    print(anovaResults)

    # Check if the two results are equal. If they are, there is no output.
    # The ANOVA table is indexed by term name, so the first row has to be
    # selected by position with .iloc.
    np.testing.assert_almost_equal(F_statistic, anovaResults['F'].iloc[0])

    return (F_statistic, pVal)  # should be (3.711335988266943, 0.043589334959179327)
def regression_line():
    """Fit a line, using the "ordinary least squares" method of statsmodels.

    Data from 24 type 1 diabetic patients, relating Fasting blood glucose (mmol/l) to mean circumferential shortening velocity (%/sec), derived from echocardiography.

    The summary of the fitted model is printed.

    Returns
    -------
    float
        The F statistic of the fitted model.
    """

    # pandas no longer provides "pd.ols"; the statsmodels formula interface
    # is used instead. Imported locally so that this function does not depend
    # on the module-level imports.
    from statsmodels.formula.api import ols

    # Get the data
    data = getData('altman_11_6.txt', subDir=r'..\Data\data_altman')

    df = pd.DataFrame(data, columns=['glucose', 'Vcf'])
    # --- >>> START stats <<< ---
    # Regress Vcf on glucose; the formula adds an intercept automatically
    model = ols('Vcf ~ glucose', df).fit()
    print(model.summary())
    # --- >>> STOP stats <<< ---

    return model.fvalue  # should be 4.4140184331462571
def anova_interaction():
    """ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.

    The ANOVA table is printed.

    Returns
    -------
    float
        The F value in the first row of the ANOVA table.
    """
    # NOTE(review): a second function with the same name is defined further
    # down in this file and replaces this one when the module is loaded.

    # Get the data. The path is a raw string because "\D" and "\d" are not
    # valid escape sequences in a normal string literal.
    data = getData("altman_12_6.txt", subDir=r"..\Data\data_altman")

    # Bring them in DataFrame-format
    df = pd.DataFrame(data, columns=["hs", "fetus", "observer"])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction: both factors are treated as
    # categorical, and "C(fetus):C(observer)" is the interaction term
    formula = "hs ~ C(fetus) + C(observer) + C(fetus):C(observer)"
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # The table is indexed by term name, so select the first row by position
    return anovaResults["F"].iloc[0]
def anova_interaction():
    '''ANOVA with interaction: Measurement of fetal head circumference,
    by four observers in three fetuses, from a study investigating the
    reproducibility of ultrasonic fetal head circumference data.

    The resulting ANOVA table is printed.

    Returns
    -------
    float
        The F value in the first row of the ANOVA table.
    '''

    # Get the data. A raw string is used for the path, because '\D' and '\d'
    # are not valid escape sequences in a normal string literal.
    data = getData('altman_12_6.txt', subDir=r'..\Data\data_altman')

    # Bring them in dataframe-format
    df = pd.DataFrame(data, columns=['hs', 'fetus', 'observer'])

    # --- >>> START stats <<< ---
    # Determine the ANOVA with interaction: the model contains both
    # categorical factors and their interaction 'C(fetus):C(observer)'
    formula = 'hs ~ C(fetus) + C(observer) + C(fetus):C(observer)'
    lm = ols(formula, df).fit()
    anovaResults = anova_lm(lm)
    # --- >>> STOP stats <<< ---
    print(anovaResults)

    # The rows of the table are labelled by term name, so the first row is
    # addressed by position via .iloc
    first_F = anovaResults['F'].iloc[0]
    return first_F
def check_mean():
    '''Data from Altman, check for significance of mean value.

    Compare average daily energy intake (kJ) over 10 days of 11 healthy women, and compare it to the recommended level of 7725 kJ.

    Prints the mean and sample standard deviation, the 95% confidence
    interval of the mean, and the outcome of a one-sample t-test and of a
    Wilcoxon signed rank test against the recommended level.

    Returns
    -------
    float
        p-value of the one-sample t-test.
    '''

    # Get data from Altman. The path is a raw string because '\D' and '\d'
    # are not valid escape sequences in a normal string literal.
    data = getData('altman_91.txt', subDir=r'..\Data\data_altman')

    # Watch out: by default the standard deviation in numpy is calculated with ddof=0, corresponding to 1/N!
    myMean = np.mean(data)
    mySD = np.std(data, ddof=1)     # sample standard deviation
    print(('Mean and SD: {0:4.2f} and {1:4.2f}'.format(myMean, mySD)))

    # Confidence intervals, based on the t-distribution with n-1 degrees of
    # freedom; ppf(0.975) gives the two-sided 95% limit
    tf = stats.t(len(data) - 1)
    # multiplication with np.array[-1,1] is a neat trick to implement "+/-"
    ci = myMean + stats.sem(data) * np.array([-1, 1]) * tf.ppf(0.975)
    print(('The confidence intervals are {0:4.2f} to {1:4.2f}.'.format(ci[0], ci[1])))

    # Check if there is a significant difference relative to "checkValue"
    checkValue = 7725
    # --- >>> START stats <<< ---
    t, prob = stats.ttest_1samp(data, checkValue)
    if prob < 0.05:
        print(('{0:4.2f} is significantly different from the mean (p={1:5.3f}).'.format(checkValue, prob)))

    # For not normally distributed data, use the Wilcoxon signed rank test
    (rank, pVal) = stats.wilcoxon(data - checkValue)
    issignificant = 'unlikely' if pVal < 0.05 else 'likely'
    # --- >>> STOP stats <<< ---

    print(('It is ' + issignificant + ' that the value is {0:d}'.format(checkValue)))

    return prob  # should be 0.018137235176105802
Example #10
0
 def test_getdata(self):
     """Check that the first value read from 'altman_93.txt' is 5260."""
     first_row = getData('altman_93.txt', subDir='../Data/data_altman')[0]
     self.assertEqual(first_row[0], 5260)