Beispiel #1
0
def ComputeCorrelations(heights, weights):
    """Print correlations of heights and weights and a least squares fit.

    The Pearson correlation (raw and log weights) and the Spearman
    correlation are compared with hard-coded reference values through
    almostEquals.  The fit is computed with thinkstats2.LeastSquares on
    heights and log weights, and sqrt(R^2) of that fit is compared with
    the Pearson correlation of the log weights.

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    logs = np.log(weights)
    rho_log = thinkstats2.Corr(heights, logs)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    inter, slope = thinkstats2.LeastSquares(heights, logs)
    print('Least squares inter, slope (log weights):', inter, slope)

    residuals = thinkstats2.Residuals(heights, logs, inter, slope)
    r_squared = thinkstats2.CoefDetermination(logs, residuals)
    root = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', root)

    # sqrt(R^2) of the fit is expected to match the log-weight correlation
    assert almostEquals(root, rho_log)
    def testCov(self):
        """Check Cov, Corr and SpearmanCorr on two small integer sequences."""
        seq = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
        arr = np.array(seq)
        other = [5, 4, 3, 0, 8, 9, 7, 6, 2, 1]

        check = self.assertAlmostEqual

        # a sequence against itself and against its negation
        check(thinkstats2.Cov(seq, arr), 8.25)
        check(thinkstats2.Cov(seq, -arr), -8.25)

        check(thinkstats2.Corr(seq, arr), 1)
        check(thinkstats2.Corr(seq, -arr), -1)
        check(thinkstats2.Corr(seq, other), -0.1878787878)

        check(thinkstats2.SpearmanCorr(seq, -arr), -1)
        check(thinkstats2.SpearmanCorr(seq, other), -0.1878787878)
Beispiel #3
0
    def TestStatistic(self, data):
        """Return the absolute value of thinkstats2.Corr for the data.

        data: tuple of xs and ys
        """
        xs, ys = data
        return abs(thinkstats2.Corr(xs, ys))
Beispiel #4
0
def Correlations(df):
    """Print covariance and correlations of htm3 and wtkg2.

    Each statistic is printed once from pandas and once from thinkstats2
    so the two can be compared; the last lines repeat the correlations
    with log-transformed wtkg2.

    df: DataFrame with htm3 and wtkg2 columns
    """
    heights, weights = df.htm3, df.wtkg2

    print('pandas cov', heights.cov(weights))
    print('thinkstats2 Cov', thinkstats2.Cov(heights, weights))
    print()

    print('pandas corr', heights.corr(weights))
    print('thinkstats2 Corr', thinkstats2.Corr(heights, weights))
    print()

    print('pandas corr spearman', heights.corr(weights, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(heights, weights))

    # the log transform is applied to wtkg2, so the labels name that column
    log_weights = np.log(weights)
    print('thinkstats2 SpearmanCorr log wtkg2',
          thinkstats2.SpearmanCorr(heights, log_weights))
    print()

    print('thinkstats2 Corr log wtkg2',
          thinkstats2.Corr(heights, log_weights))
    print()
def main(script):
    """Print age/birth-weight correlations and save a scatter plot.

    script: not referenced in the body
    """
    thinkstats2.RandomSeed(17)

    frames = first.MakeFrames()
    live, firsts, others = frames

    # work only with rows that have both columns
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    mother_ages = live.agepreg
    birth_weights = live.totalwgt_lb
    print('thinkstats2 Corr',
          thinkstats2.Corr(mother_ages, birth_weights))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(mother_ages, birth_weights))

    ScatterPlot(mother_ages, birth_weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1',
                   legend=False,
                   formats=['jpg'])
Beispiel #6
0
def PValue(xs, ys, n=10):
    """Return the fraction of simulated correlations at least as extreme
    as the observed one.

    Both the observed and the simulated correlations are compared by
    absolute value.

    xs: sequence
    ys: sequence
    n: number of calls to SimulateNull

    returns: float, number of hits divided by n
    """
    actual = thinkstats2.Corr(xs, ys)

    # SimulateNull receives private lists, presumably so that any in-place
    # reordering it does cannot touch the caller's sequences
    scratch_xs, scratch_ys = list(xs), list(ys)
    simulated = [SimulateNull(scratch_xs, scratch_ys) for _ in range(n)]

    extreme = sum(1 for corr in simulated if abs(corr) >= abs(actual))
    return extreme / float(n)
Beispiel #7
0
def main(name, data_dir='.'):
    """Plot the data read from data_dir and print correlation statistics.

    Saves a scatter plot under the root 'correlate1', prints the Pearson
    and Spearman correlations, ten results of SimulateNull and the result
    of PValue with 1000 iterations.

    name: not referenced in the body
    data_dir: directory passed to ReadData
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # print() calls replace the Python 2 print statements, which do not
    # compile on Python 3
    print('Pearson', thinkstats2.Corr(xs, ys))
    print('Spearman', thinkstats2.SpearmanCorr(xs, ys))

    # each simulation gets fresh copies of the data
    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))
Beispiel #8
0
def main():
    """Generate correlated pairs, print their statistics and show a scatter.

    The pairs come from CorrelatedGenerator(1000, rho) with rho = -0.8 and
    a fixed random seed.
    """
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # linear transform a * x + b of the xs; with a = 1.0 and b = 0.0 the
    # values are numerically unchanged
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # print() calls replace the Python 2 print statements, which do not
    # compile on Python 3
    print('mean, var of x', thinkstats2.MeanVar(xs))
    print('mean, var of y', thinkstats2.MeanVar(ys))
    print('covariance', thinkstats2.Cov(xs, ys))
    print('Pearson corr', thinkstats2.Corr(xs, ys))
    print('Spearman corr', thinkstats2.SpearmanCorr(xs, ys))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Beispiel #9
0
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept
    and slope, and R^2, followed by a blank line.

    ages: sequence
    weights: sequence

    returns: tuple (inter, slope, R2)
    """
    # compute the correlation between age and weight
    print('Pearson correlation', thinkstats2.Corr(ages, weights))
    print('Spearman correlation', thinkstats2.SpearmanCorr(ages, weights))

    # compute least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope):', inter, slope)

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2', R2)
    # print() is required for the blank line: a bare `print` is a no-op
    # expression on Python 3
    print()
    return inter, slope, R2
Beispiel #10
0
def main():
    """Fit a line to the data from SatIqData and show it over a scatter.

    Prints mean/variance of both variables, their Pearson correlation,
    the fitted intercept and slope, and R2.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    # print() calls replace the Python 2 print statements, which do not
    # compile on Python 3
    print('mean, var of x', thinkstats2.MeanVar(xs))
    print('mean, var of y', thinkstats2.MeanVar(ys))
    print('Pearson corr', thinkstats2.Corr(xs, ys))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter', inter)
    print('slope', slope)

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2', R2)

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Beispiel #11
0
def ComputeAirlineArrivalDelayCorrelations(flights):
    """Print covariance and correlations of airline code and arrival delay.

    Every statistic is printed once from pandas and once from thinkstats2,
    in the manner of Correlations() in scatter.py.

    flights: DataFrame with AIRLINE, AIRLINE_CODE and ARRIVAL_DELAY columns;
             rows missing AIRLINE or ARRIVAL_DELAY are dropped first
    """
    flights = flights.dropna(subset=['AIRLINE', 'ARRIVAL_DELAY'])
    codes = flights.AIRLINE_CODE
    delays = flights.ARRIVAL_DELAY

    # covariance
    print('pandas cov', codes.cov(delays))
    print('thinkstats2 Cov', thinkstats2.Cov(codes, delays))
    print()

    # Pearson correlation
    print('pandas corr Pearson', codes.corr(delays))
    print('thinkstats2 Corr Pearson', thinkstats2.Corr(codes, delays))
    print()

    # Spearman correlation
    print('pandas corr spearman', codes.corr(delays, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(codes, delays))
    print()
Beispiel #12
0
def SerialCorr(series, lag=1):
    """Return the correlation of series with itself shifted by lag.

    series: object that supports slicing and .shift(lag)
    lag: shift amount; the first lag elements of both sequences are dropped
    """
    current = series[lag:]
    lagged = series.shift(lag)[lag:]
    return thinkstats2.Corr(current, lagged)
Beispiel #13
0
 def TestStatistic(self, data):
     """Return the absolute value of thinkstats2.Corr for the data.

     data: pair of sequences
     """
     first_seq, second_seq = data
     return abs(thinkstats2.Corr(first_seq, second_seq))
 def TestStatistic(self, data):
     """Return ts2.Corr for the data, with its sign kept.

     data: pair of sequences
     """
     first_seq, second_seq = data
     return ts2.Corr(first_seq, second_seq)
Beispiel #15
0
import thinkstats2
import thinkplot
import first
import numpy as np

# Load the frames and drop rows missing either of the two columns used below.
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

# Pearson and Spearman correlation of agepreg and totalwgt_lb.
rho = thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
rho_s = thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)
print("Pearson's Correlation, Mother's age and Birth weight: ", rho)
print("Spearman's Rank Correlation, Mother's age and Birth weight: ", rho_s)

# Scatter plot, saved without a legend.
thinkplot.LEGEND = False
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
thinkplot.SaveFormat(root='age_weight_scatter',
                     fmt='png',
                     xlabel="Mother's age",  # was misspelled "Mothers's age"
                     ylabel='Birth weight')

# Percentiles of totalwgt_lb within bins of agepreg (bin edges 10 up to 45,
# step 2.5).
thinkplot.LEGEND = True
bins = np.arange(10, 45, 2.5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)
ages = [group.agepreg.mean() for _, group in groups]
cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for _, group in groups]
for percent in [75, 50, 25]:
    weights = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(ages, weights, label=label)
Beispiel #16
0
def PlotSamplingDistributions(live):
    """Plot confidence intervals of the fitted line and sampling distributions.

    Fits totalwgt_lb against agepreg, prints summary statistics of the fit,
    and saves three figures under the roots 'linear3', 'linear5' and
    'linear4'.

    live: DataFrame
    """
    ages, weights = live.agepreg, live.totalwgt_lb

    inter, slope = thinkstats2.LeastSquares(ages, weights)
    residuals = thinkstats2.Residuals(ages, weights, inter, slope)
    r_squared = thinkstats2.CoefDetermination(weights, residuals)

    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r_squared)
    print('R', math.sqrt(r_squared))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(residuals))

    # figure 'linear3': 90% and 50% bands, each with a text marker
    inters, slopes = SamplingDistributions(live, iters=1001)
    bands = [
        (90, 0.3, '90% CI', 7.53, '90%'),
        (50, 0.5, '50% CI', 7.59, '50%'),
    ]
    for percent, alpha, label, text_y, text in bands:
        PlotConfidenceIntervals(ages, inters, slopes,
                                percent=percent, alpha=alpha, label=label)
        thinkplot.Text(42, text_y, text)

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # figure 'linear5': scatter of the data plus intervals drawn with and
    # without the residuals
    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=residuals, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)
    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # sampling distribution of the slope, compared with the slope test
    slope_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', slope_cdf[0])

    test = SlopeTest((ages, weights))
    print('p-value, slope test', test.PValue())

    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    # figure 'linear4': null hypothesis CDF next to the sampling distribution
    thinkplot.PrePlot(2)
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    test.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(slope_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
Beispiel #17
0
# Group the rows by bin index, then take the mean age and the weight CDF
# of every group.
groups = live_ss.groupby(indices)

age_means = [grp.agepreg.mean() for _, grp in groups]
wgt_cdfs = [thinkstats2.Cdf(grp.totalwgt_lb) for _, grp in groups]

# One line per percentile, plotted against the mean age of each group.
percentiles = [75, 50, 25]
thinkplot.PrePlot(len(percentiles))
for percent in percentiles:
    wgt_percentile = [wgt_cdf.Percentile(percent) for wgt_cdf in wgt_cdfs]
    label = '%dth' % percent
    thinkplot.Plot(age_means, wgt_percentile, label=label)
thinkplot.Config(
    xlabel='Mother age (years)',
    ylabel='Birth weight (lbs)',
    legend=True,
)

# Pearson and Spearman correlation of the two columns.
p_corr = thinkstats2.Corr(live_ss.agepreg, live_ss.totalwgt_lb)
s_corr = thinkstats2.SpearmanCorr(live_ss.agepreg, live_ss.totalwgt_lb)
print("Pearson's Correlation:", p_corr)
print("Spearman's Correlation:", s_corr)


#--- Chapter8 Ex2
def SimulateSample(lam=2, n=10, iters=1000):
    """Draw exponential samples repeatedly and estimate a rate from each.

    lam: rate; every sample is drawn with scale 1.0 / lam
    n: number of values per sample
    iters: number of samples to draw

    returns: list with one estimate, 1 / (sample mean), per sample
    """
    return [
        1 / np.mean(np.random.exponential(1.0 / lam, n))
        for _ in np.arange(iters)
    ]

Beispiel #18
0
def CorrelationPlots(df,
                     xlabel,
                     ylabel,
                     xjitter=0,
                     yjitter=0,
                     axis=None,
                     nbins=5,
                     **options):
    """Draw a 2x2 grid of plots for two columns of df and print correlations.

    The subplots are: a scatter plot, a HexBin plot, the 75th/50th/25th
    percentiles of ylabel within bins of xlabel, and one CDF of ylabel per
    bin of xlabel.  The Pearson and Spearman correlations of the jittered
    values are printed at the end.

    df: DataFrame; rows missing xlabel or ylabel are dropped
    xlabel: column used for x, also used as axis label
    ylabel: column used for y, also used as axis label
    xjitter: amount passed to thinkstats2.Jitter for the x values
    yjitter: amount passed to thinkstats2.Jitter for the y values
    axis: [xmin, xmax, ymin, ymax] passed to Config; defaults to the range
          of the jittered values
    nbins: NOTE(review): used as the *step* of np.arange for the percentile
           plot and as the divisor (nbins - 2) for the CDF plot, not as a
           literal bin count -- confirm the intent
    options: accepted but not referenced in the body
    """

    cleaned = df.dropna(subset=[xlabel, ylabel])
    xs = cleaned[xlabel]
    ys = cleaned[ylabel]

    xs = thinkstats2.Jitter(xs, xjitter)
    ys = thinkstats2.Jitter(ys, yjitter)

    # default axis limits come from the jittered values
    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)
    if axis is None:
        axis = [xmin, xmax, ymin, ymax]

    PrePlot(num=4, rows=2, cols=2)

    # subplot 1: scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # subplot 2: HexBin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # subplot 3: percentiles of y within bins of x
    SubPlot(3)

    # bin edges run from the 1st to the 99th percentile of the jittered xs
    xs_cdf = thinkstats2.Cdf(xs)
    lower = xs_cdf.Percentile(1)
    upper = xs_cdf.Percentile(99)

    bins = np.arange(lower, upper, nbins)
    # bin indices come from the jittered xs, while the grouped rows are the
    # un-jittered rows of cleaned
    indices = np.digitize(xs, bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    for percent in [75, 50, 25]:
        y_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
        label = '%dth' % percent
        Plot(mean_xs, y_percentiles, label=label)

    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # subplot 4: CDFs of y per bin of x, with bins rebuilt using a
    # different step
    # NOTE(review): nbins == 2 makes the divisor below zero -- confirm
    # callers never pass it
    n = (upper - lower) // (nbins - 2)
    bins = np.arange(lower, upper, n)
    # this time the bin indices come from the un-jittered column
    indices = np.digitize(cleaned[xlabel], bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    # draw one CDF per group, labelled with the bin it is taken to cover
    SubPlot(4)
    PrePlot(len(cdfs))
    # NOTE(review): i counts the groups that actually occur; the labels
    # assume it lines up with positions in bins -- confirm for data that
    # leaves some bins empty
    for i, cdf in enumerate(cdfs):
        if i == 0:
            label = '<%d ' % bins[0] + xlabel
        elif i == len(cdfs) - 1:
            label = '>%d ' % bins[-1] + xlabel
        else:
            label = '%d - %d ' % (bins[i - 1], bins[i]) + xlabel
        Cdf(cdf, label=label)
        # Config is re-applied after every CDF is drawn
        Config(xlabel=ylabel, ylabel='CDF', legend=True)

    # print the correlations of the jittered values
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n',
          thinkstats2.SpearmanCorr(xs, ys))
Beispiel #19
0
    return greq, less


def SplitFrames(df):
    """Return the agepreg and totalwgt_lb columns of the complete rows of df.

    Rows with a missing value in either column are dropped first.

    df: DataFrame with agepreg and totalwgt_lb columns

    returns: tuple (agepreg column, totalwgt_lb column)
    """
    complete = df.dropna(subset=['agepreg', 'totalwgt_lb'])
    return complete.agepreg, complete.totalwgt_lb


def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    """Show a scatter plot of wgt against age with fixed axis limits.

    age: x values
    wgt: y values
    xmin, xmax: limits of the x axis
    ymin, ymax: limits of the y axis
    """
    thinkplot.Scatter(age, wgt, alpha=1.0)

    axis_options = dict(
        xlabel='Age (Years)',
        ylabel='Birth Weight (lbs)',
        xlim=[xmin, xmax],
        ylim=[ymin, ymax],
        legend=False,
    )
    thinkplot.Config(**axis_options)
    thinkplot.Show()


# Split both frames into age and weight columns, show a scatter plot of
# each, then print the Pearson and Spearman correlations.
greq, less = MakeFrames()
greqage, greqwgt = SplitFrames(greq)
lessage, lesswgt = SplitFrames(less)
PlotScatter(greqage, greqwgt, 30, 50, 0, 14)
PlotScatter(lessage, lesswgt, 5, 30, 0, 14)

# print() calls replace the Python 2 print statements, which do not compile
# on Python 3
print("Greq 30 Pearson's corr:", thinkstats2.Corr(greqage, greqwgt))
print("Greq 30 Spearman corr:", thinkstats2.SpearmanCorr(greqage, greqwgt))
print("Less 30 Pearson's corr:", thinkstats2.Corr(lessage, lesswgt))
print("Less 30 Spearman corr:", thinkstats2.SpearmanCorr(lessage, lesswgt))
groups = data.groupby(indices)

# mean height and residual CDF per group; the first and last groups are
# left out
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]

# plot the percentiles
for p in [75, 50, 25]:
    ys = [cdf.Percentile(p) for cdf in cdfs]
    label = str(p) + 'th'
    thinkplot.Plot(means, ys, label=label)

thinkplot.Config(xlabel='height (cm)', ylabel='residual weight (kg)')

#%%
# calculate correlation and coefficient of determination
rho = thinkstats2.Corr(heights, logWeight)
r2 = thinkstats2.CoefDetermination(logWeight, res)

# check if R^2 = rho^2
print("Correlation: {:.3f}".format(rho))
print("Coefficient of determination: {:.3f}".format(r2))

# the difference is taken in the order the label states (R^2 minus rho^2)
print("R^2 - rho^2: {:.3f}".format(r2 - rho**2))

#%%
# calc standard deviation (RMSE) of prediction w/o height
std_ys = thinkstats2.Std(logWeight)
print("Standard deviation w/o height: {:.3f}".format(std_ys))

#%%
# calc standard deviation (RMSE) of prediction w/ height
Beispiel #21
0
def SimulateNull(xs, ys):
    """Shuffle xs and ys in place and return their correlation.

    xs: mutable sequence, reordered by this call
    ys: mutable sequence, reordered by this call

    returns: thinkstats2.Corr of the shuffled sequences
    """
    # xs is shuffled before ys, so the random stream is consumed in a
    # fixed order
    for seq in (xs, ys):
        random.shuffle(seq)
    return thinkstats2.Corr(xs, ys)
Beispiel #22
0
# CDF of Age
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# normal probability plot: a line with intercept mean and slope std as the
# model, next to the values returned by NormalProbability for Age
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# scatter plots and correlation
# year vs. age
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# the correlation used to be computed and thrown away; print it so the
# script reports it
print('Corr(Year, Age):', thinkstats2.Corr(df.Year, df.Age))
# drug vs. age
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# testing a difference in gender
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')
Beispiel #23
0
    # bin the data
    bins = np.arange(120, 200, 6)
    indices = np.digitize(df.htm3, bins)
    groups = df.groupby(indices)

    # make cdfs
    height_means = [group.htm3.mean() for _, group in groups][1:-1]
    cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]

    # make plot of percentiles
    PlotPercentileLines(height_means, cdfs,
                        xlabel='height(cm)',
                        ylabel='residual log_10 weight (log_10 kg)')

    ## calculate correlation
    rho = thinkstats2.Corr(heights, log_weights)
    print('rho:\n',rho)

    ## coefficient of determination
    res = df.residual
    r2 = CoefDetermination(log_weights, res)
    print('r2:\n',r2)

    ## confirm that R^2 = rho^2
    print('rho**2:\n',rho**2)
    print('r2:\n',r2)

    ## Std(ys)
    print('Std(log_weights):\n',Std(log_weights))
    print('Std(res):\n',Std(res))
    ratio = 1 - (Std(res) / Std(log_weights))