Ejemplo n.º 1
0
def ComputeCorrelations(heights, weights):
    """Report correlations and a least squares fit of log weight on height.

    Prints the Pearson and Spearman correlations of heights and weights,
    the Pearson correlation of heights and log weights, the least squares
    intercept and slope for log weights, and the coefficient of
    determination of that fit.  Along the way it asserts that the results
    match hard-coded reference values and that sqrt(R^2) agrees with the
    Pearson correlation of the log weights.

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    logs = np.log(weights)
    rho_log = thinkstats2.Corr(heights, logs)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    # The rank correlation is printed before it is checked.
    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    intercept, gradient = thinkstats2.LeastSquares(heights, logs)
    print('Least squares inter, slope (log weights):', intercept, gradient)

    residuals = thinkstats2.Residuals(heights, logs, intercept, gradient)
    r_squared = thinkstats2.CoefDetermination(logs, residuals)
    root = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', root)

    assert almostEquals(root, rho_log)
Ejemplo n.º 2
0
    def testCov(self):
        """Check Cov, Corr and SpearmanCorr against known values.

        Each case pairs a function and two sequences with the value the
        result is expected to almost-equal.
        """
        seq = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
        arr = np.array(seq)
        other = [5, 4, 3, 0, 8, 9, 7, 6, 2, 1]

        cases = [
            (thinkstats2.Cov, seq, arr, 8.25),
            (thinkstats2.Cov, seq, -arr, -8.25),
            (thinkstats2.Corr, seq, arr, 1),
            (thinkstats2.Corr, seq, -arr, -1),
            (thinkstats2.Corr, seq, other, -0.1878787878),
            (thinkstats2.SpearmanCorr, seq, -arr, -1),
            (thinkstats2.SpearmanCorr, seq, other, -0.1878787878),
        ]
        for func, xs, ys, expected in cases:
            self.assertAlmostEqual(func(xs, ys), expected)
Ejemplo n.º 3
0
    def TestStatistic(self, data):
        """Returns the absolute value of the correlation of the data.

        data: tuple of xs and ys
        """
        xs, ys = data
        return abs(thinkstats2.Corr(xs, ys))
Ejemplo n.º 4
0
def Correlations(df):
    """Print covariance and correlation of the htm3 and wtkg2 columns.

    Reports each statistic as computed by pandas next to the value
    computed by thinkstats2, then the Spearman and Pearson correlations
    of htm3 with the log of wtkg2.

    df: DataFrame with htm3 and wtkg2 columns
    """
    heights = df.htm3
    weights = df.wtkg2
    log_weights = np.log(weights)

    print('pandas cov', heights.cov(weights))
    print('thinkstats2 Cov', thinkstats2.Cov(heights, weights))
    print()

    print('pandas corr', heights.corr(weights))
    print('thinkstats2 Corr', thinkstats2.Corr(heights, weights))
    print()

    print('pandas corr spearman', heights.corr(weights, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(heights, weights))
    # The labels name wtkg2 because that is the column the values come from.
    print('thinkstats2 SpearmanCorr log wtkg2',
          thinkstats2.SpearmanCorr(heights, log_weights))
    print()

    print('thinkstats2 Corr log wtkg2',
          thinkstats2.Corr(heights, log_weights))
    print()
def main(script):
    """Print age/weight correlations and save a scatter plot.

    Seeds the thinkstats2 random generator, loads the frames from
    first.MakeFrames, drops rows missing agepreg or totalwgt_lb, runs
    BinnedPercentiles on the result and saves the scatter plot under the
    root 'chap07scatter1'.

    script: not used in the body
    """
    thinkstats2.RandomSeed(17)

    # Only the first of the three frames is used below.
    live, _firsts, _others = first.MakeFrames()
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    ages, weights = live.agepreg, live.totalwgt_lb
    pearson = thinkstats2.Corr(ages, weights)
    print('thinkstats2 Corr', pearson)
    spearman = thinkstats2.SpearmanCorr(ages, weights)
    print('thinkstats2 SpearmanCorr', spearman)

    ScatterPlot(ages, weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1', legend=False, formats=['jpg'])
Ejemplo n.º 6
0
def PValue(xs, ys, n=10):
    """Estimate a two-sided p-value for the correlation of xs and ys.

    Calls SimulateNull n times on list copies of the inputs and returns
    the fraction of simulated correlations whose absolute value is at
    least the absolute value of the observed correlation.

    xs: sequence
    ys: sequence
    n: number of simulations

    returns: float fraction of simulations at least as extreme
    """
    threshold = abs(thinkstats2.Corr(xs, ys))

    # SimulateNull receives these copies rather than the caller's sequences.
    xs_scratch, ys_scratch = list(xs), list(ys)
    simulated = [SimulateNull(xs_scratch, ys_scratch) for _ in range(n)]

    # what does the distribution of simulated correlations look like?

    extreme = sum(1 for value in simulated if abs(value) >= threshold)
    return extreme / float(n)
Ejemplo n.º 7
0
def main(name, data_dir='.'):
    """Plot the data, then print correlations and a simulated p-value.

    Reads xs and ys with ReadData, saves a scatter plot under the root
    'correlate1', prints the Pearson and Spearman correlations, prints
    ten results of SimulateNull, and finally prints PValue computed with
    1000 simulations.

    name: not used in the body
    data_dir: passed to ReadData
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # print() calls replace Python 2 print statements, which are a
    # SyntaxError under Python 3.
    print('Pearson', thinkstats2.Corr(xs, ys))
    print('Spearman', thinkstats2.SpearmanCorr(xs, ys))

    # Each simulation gets fresh list copies of the data.
    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))
Ejemplo n.º 8
0
def main():
    """Generate correlated data, print summary statistics and plot it.

    Seeds the random module, draws 1000 pairs from CorrelatedGenerator
    with rho = -0.8, applies the linear transform a * x + b to the xs,
    prints mean/variance, covariance and both correlations, and shows a
    scatter plot.
    """
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # With a = 1.0 and b = 0.0 the transform leaves the values unchanged.
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # print() calls replace Python 2 print statements, which are a
    # SyntaxError under Python 3.
    print('mean, var of x', thinkstats2.MeanVar(xs))
    print('mean, var of y', thinkstats2.MeanVar(ys))
    print('covariance', thinkstats2.Cov(xs, ys))
    print('Pearson corr', thinkstats2.Corr(xs, ys))
    print('Spearman corr', thinkstats2.SpearmanCorr(xs, ys))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Ejemplo n.º 9
0
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints summary statistics: the Pearson and Spearman correlations,
    the fitted intercept and slope, and R^2, followed by a blank line.

    ages: sequence
    weights: sequence

    returns: tuple of (inter, slope, R2)
    """
    # print() calls replace Python 2 print statements, which are a
    # SyntaxError under Python 3.

    # compute the correlation between age and weight
    print('Pearson correlation', thinkstats2.Corr(ages, weights))
    print('Spearman correlation', thinkstats2.SpearmanCorr(ages, weights))

    # compute least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope):', inter, slope)

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2', R2)
    print()
    return inter, slope, R2
Ejemplo n.º 10
0
def main():
    """Fit a line to generated data, print the fit and plot it.

    Seeds the random module, gets 1000 pairs from SatIqData with
    rho = 0.8, prints mean/variance and the Pearson correlation, fits a
    least squares line, prints its intercept, slope and R2, and shows
    the fitted line over a scatter plot of the data.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    # print() calls replace Python 2 print statements, which are a
    # SyntaxError under Python 3.
    print('mean, var of x', thinkstats2.MeanVar(xs))
    print('mean, var of y', thinkstats2.MeanVar(ys))
    print('Pearson corr', thinkstats2.Corr(xs, ys))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter', inter)
    print('slope', slope)

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2', R2)

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Ejemplo n.º 11
0
def ComputeAirlineArrivalDelayCorrelations(flights):
    """Print covariance and correlations of airline code and arrival delay.

    Each statistic is printed as computed by pandas next to the value
    computed by thinkstats2.  This is similar to Correlations() in
    scatter.py.

    flights: DataFrame with AIRLINE, AIRLINE_CODE and ARRIVAL_DELAY columns
    """
    # NOTE(review): rows are filtered on AIRLINE, but the statistics use
    # AIRLINE_CODE -- confirm the two columns are missing together.
    flights = flights.dropna(subset=['AIRLINE', 'ARRIVAL_DELAY'])
    codes = flights.AIRLINE_CODE
    delays = flights.ARRIVAL_DELAY

    print('pandas cov', codes.cov(delays))
    print('thinkstats2 Cov', thinkstats2.Cov(codes, delays))
    print()

    print('pandas corr Pearson', codes.corr(delays))
    print('thinkstats2 Corr Pearson', thinkstats2.Corr(codes, delays))
    print()

    print('pandas corr spearman', codes.corr(delays, method='spearman'))
    print('thinkstats2 SpearmanCorr', thinkstats2.SpearmanCorr(codes, delays))
    print()
Ejemplo n.º 12
0
def SerialCorr(series, lag=1):
    """Correlate a series with a copy of itself shifted by lag.

    The first lag entries are sliced off both the series and its
    shifted copy before the correlation is computed.

    series: object supporting slicing and .shift (presumably a pandas
            Series -- confirm against callers)
    lag: shift passed to series.shift and used as the slice start

    returns: result of thinkstats2.Corr on the two slices
    """
    current = series[lag:]
    lagged = series.shift(lag)[lag:]
    return thinkstats2.Corr(current, lagged)
Ejemplo n.º 13
0
 def TestStatistic(self, data):
     """Return the absolute correlation of the two sequences in data.

     data: pair of sequences (xs, ys)
     """
     first_seq, second_seq = data
     correlation = thinkstats2.Corr(first_seq, second_seq)
     return abs(correlation)
Ejemplo n.º 14
0
 def TestStatistic(self, data):
     """Return the signed correlation of the two sequences in data.

     data: pair of sequences (xs, ys)
     """
     sample_x, sample_y = data
     return ts2.Corr(sample_x, sample_y)
Ejemplo n.º 15
0
import thinkstats2
import thinkplot
import first
import numpy as np

# Load the frames and keep only rows that have both an age and a weight.
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

# Pearson and Spearman correlation between agepreg and totalwgt_lb.
rho = thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
rho_s = thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)
print('Pearson\'s Correlation, Mother\'s age and Birth weight: ', rho)
print('Spearman\'s Rank Correlation, Mother\'s age and Birth weight: ', rho_s)

# Scatter plot of weight against age, written to a png file.
thinkplot.LEGEND = False
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
#thinkplot.Show(xlabel = 'Mother\'s age', ylabel = 'Birth weight')
# NOTE(review): confirm that SaveFormat applies xlabel/ylabel to the figure.
thinkplot.SaveFormat(root='age_weight_scatter',
                     fmt='png',
                     xlabel='Mother\'s age',
                     ylabel='Birth weight')

# Bin the rows by age in steps of 2.5 from 10 up to (not including) 45,
# then plot the 75th, 50th and 25th weight percentile of each bin against
# the bin's mean age.
thinkplot.LEGEND = True
bins = np.arange(10, 45, 2.5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)
ages = [group.agepreg.mean() for i, group in groups]
cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]
for percent in [75, 50, 25]:
    weights = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(ages, weights, label=label)
Ejemplo n.º 16
0
def PlotSamplingDistributions(live):
    """Plots confidence intervals for the fitted curve and sampling dists.

    Fits totalwgt_lb against agepreg by least squares, prints summary
    statistics of the fit, and saves three figures under the roots
    'linear3', 'linear5' and 'linear4'.

    live: DataFrame with agepreg and totalwgt_lb columns
    """
    ages = live.agepreg
    weights = live.totalwgt_lb
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    res = thinkstats2.Residuals(ages, weights, inter, slope)
    r2 = thinkstats2.CoefDetermination(weights, res)

    # print the correlation next to R2 and its square root, followed by
    # the standard deviation of the weights and of the residuals
    print('rho', thinkstats2.Corr(ages, weights))
    print('R2', r2)
    print('R', math.sqrt(r2))
    print('Std(ys)', thinkstats2.Std(weights))
    print('Std(res)', thinkstats2.Std(res))

    # plot the confidence intervals (90% and 50%) from the intercepts and
    # slopes returned by SamplingDistributions
    inters, slopes = SamplingDistributions(live, iters=1001)
    PlotConfidenceIntervals(ages,
                            inters,
                            slopes,
                            percent=90,
                            alpha=0.3,
                            label='90% CI')
    thinkplot.Text(42, 7.53, '90%')
    PlotConfidenceIntervals(ages,
                            inters,
                            slopes,
                            percent=50,
                            alpha=0.5,
                            label='50% CI')
    thinkplot.Text(42, 7.59, '50%')

    thinkplot.Save(root='linear3',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   legend=False)

    # plot the data as a scatter plot with confidence intervals on top
    # NOTE(review): the res=res call presumably widens the band using the
    # residuals -- confirm in PlotConfidenceIntervals
    thinkplot.PrePlot(2)
    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    PlotConfidenceIntervals(ages, inters, slopes, res=res, alpha=0.2)
    PlotConfidenceIntervals(ages, inters, slopes)
    thinkplot.Save(root='linear5',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   title='90% CI',
                   axis=[10, 45, 0, 15],
                   legend=False)

    # plot the sampling distribution of slope under null hypothesis
    # and alternate hypothesis
    # NOTE(review): sampling_cdf[0] presumably evaluates the Cdf of the
    # slopes at 0 -- confirm against thinkstats2.Cdf
    sampling_cdf = thinkstats2.Cdf(slopes)
    print('p-value, sampling distribution', sampling_cdf[0])

    ht = SlopeTest((ages, weights))
    pvalue = ht.PValue()
    print('p-value, slope test', pvalue)

    # print each fitted parameter next to the mean of its sampled values
    print('inter', inter, thinkstats2.Mean(inters))
    Summarize(inters, inter)
    print('slope', slope, thinkstats2.Mean(slopes))
    Summarize(slopes, slope)

    thinkplot.PrePlot(2)
    # vertical line at x = 0 running from y = 0 to y = 1
    thinkplot.Plot([0, 0], [0, 1], color='0.8')
    ht.PlotCdf(label='null hypothesis')
    thinkplot.Cdf(sampling_cdf, label='sampling distribution')
    thinkplot.Save(root='linear4',
                   xlabel='slope (lbs / year)',
                   ylabel='CDF',
                   xlim=[-0.03, 0.03],
                   loc='upper left')
Ejemplo n.º 17
0
# Group the rows of live_ss by bin index; live_ss and indices are defined
# outside this excerpt.
groups = live_ss.groupby(indices)

# Mean age and Cdf of birth weight for each group.
age_means = [g.agepreg.mean() for i, g in groups]
wgt_cdfs = [thinkstats2.Cdf(g.totalwgt_lb) for i, g in groups]

# Plot one line per percentile: that percentile of weight in each group
# against the group's mean age.
percentiles = [75, 50, 25]
thinkplot.PrePlot(len(percentiles))
for percent in percentiles:
    wgt_percentile = [cdf.Percentile(percent) for cdf in wgt_cdfs]
    label = '%dth' % percent
    thinkplot.Plot(age_means, wgt_percentile, label=label)
thinkplot.Config(xlabel='Mother age (years)',
                 ylabel='Birth weight (lbs)',
                 legend=True)

# Pearson and Spearman correlation of age and weight over all rows.
p_corr = thinkstats2.Corr(live_ss.agepreg, live_ss.totalwgt_lb)
s_corr = thinkstats2.SpearmanCorr(live_ss.agepreg, live_ss.totalwgt_lb)
print('Pearson\'s Correlation:', p_corr)
print('Spearman\'s Correlation:', s_corr)


#--- Chapter8 Ex2
def SimulateSample(lam=2, n=10, iters=1000):
    """Estimate an exponential rate from repeated simulated samples.

    For each of iters iterations, draws n values with
    np.random.exponential using scale 1/lam and records the reciprocal
    of the sample mean.

    lam: rate used to generate the samples
    n: size of each sample
    iters: number of samples to draw

    returns: list of estimates, one per iteration
    """
    return [
        1 / np.mean(np.random.exponential(1.0 / lam, n))
        for _ in np.arange(iters)
    ]

Ejemplo n.º 18
0
def CorrelationPlots(df,
                     xlabel,
                     ylabel,
                     xjitter=0,
                     yjitter=0,
                     axis=None,
                     nbins=5,
                     **options):
    """Draws a 2x2 grid of plots for two columns and prints correlations.

    Subplot 1 is a scatter plot and subplot 2 a hexbin plot of the
    jittered data.  Subplot 3 shows the 75th, 50th and 25th percentiles
    of ylabel per bin of xlabel, and subplot 4 the CDF of ylabel within
    each bin of xlabel.  The Pearson and Spearman correlations of the
    jittered data are printed at the end.

    PrePlot, SubPlot, Scatter, HexBin, Plot, Cdf and Config are used
    unqualified; presumably they are thinkplot functions imported
    elsewhere in the file -- TODO confirm.

    df: DataFrame; rows missing xlabel or ylabel are dropped
    xlabel: column name, also used as an axis label
    ylabel: column name, also used as an axis label
    xjitter: passed to thinkstats2.Jitter for the x values
    yjitter: passed to thinkstats2.Jitter for the y values
    axis: axis limits passed to Config; when None, the min and max of
          the jittered data are used
    nbins: step between bin edges for subplot 3; also used to derive
           the step for subplot 4
    options: accepted but not used in the body
    """

    cleaned = df.dropna(subset=[xlabel, ylabel])
    xs = cleaned[xlabel]
    ys = cleaned[ylabel]

    xs = thinkstats2.Jitter(xs, xjitter)
    ys = thinkstats2.Jitter(ys, yjitter)

    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)
    if axis is None:
        axis = [xmin, xmax, ymin, ymax]

    PrePlot(num=4, rows=2, cols=2)

    # make scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # make HexBin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # plot percentiles
    SubPlot(3)

    # bin edges run between the 1st and 99th percentile of the jittered xs
    xs_cdf = thinkstats2.Cdf(xs)
    lower = xs_cdf.Percentile(1)
    upper = xs_cdf.Percentile(99)

    # NOTE(review): nbins is the step passed to np.arange here, not a
    # count of bins -- confirm this is intended
    bins = np.arange(lower, upper, nbins)
    # NOTE(review): rows are grouped by the jittered xs here, but by the
    # unjittered column in the CDF section below -- confirm
    indices = np.digitize(xs, bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    for percent in [75, 50, 25]:
        y_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
        label = '%dth' % percent
        Plot(mean_xs, y_percentiles, label=label)

    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # plot CDFs
    # NOTE(review): the step divides by (nbins - 2), so nbins == 2 would
    # divide by zero
    n = (upper - lower) // (nbins - 2)
    bins = np.arange(lower, upper, n)
    indices = np.digitize(cleaned[xlabel], bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    ## plot the cdfs
    SubPlot(4)
    PrePlot(len(cdfs))
    # NOTE(review): labels index bins by position in cdfs, which presumes
    # that every bin produced a group -- confirm
    for i, cdf in enumerate(cdfs):
        if i == 0:
            label = '<%d ' % bins[0] + xlabel
        elif i == len(cdfs) - 1:
            label = '>%d ' % bins[-1] + xlabel
        else:
            label = '%d - %d ' % (bins[i - 1], bins[i]) + xlabel
        Cdf(cdf, label=label)
        # Config is called on every iteration of the loop
        Config(xlabel=ylabel, ylabel='CDF', legend=True)

    # print statistics
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n',
          thinkstats2.SpearmanCorr(xs, ys))
Ejemplo n.º 19
0
    return greq, less


def SplitFrames(df):
    """Return the agepreg and totalwgt_lb columns of the complete rows.

    Rows missing either agepreg or totalwgt_lb are dropped first, so the
    two returned columns share the same index.

    df: DataFrame with agepreg and totalwgt_lb columns

    returns: tuple of (agepreg column, totalwgt_lb column)
    """
    complete = df.dropna(subset=['agepreg', 'totalwgt_lb'])
    return complete.agepreg, complete.totalwgt_lb


def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    """Show a scatter plot of birth weight against age.

    age: sequence of x values
    wgt: sequence of y values
    xmin, xmax: limits of the x axis
    ymin, ymax: limits of the y axis
    """
    thinkplot.Scatter(age, wgt, alpha=1.0)

    settings = dict(
        xlabel='Age (Years)',
        ylabel='Birth Weight (lbs)',
        xlim=[xmin, xmax],
        ylim=[ymin, ymax],
        legend=False,
    )
    thinkplot.Config(**settings)
    thinkplot.Show()


# Get the two frames from MakeFrames, take the age and weight columns of
# each, and show one scatter plot per frame.
greq, less = MakeFrames()
greqage, greqwgt = SplitFrames(greq)
lessage, lesswgt = SplitFrames(less)
PlotScatter(greqage, greqwgt, 30, 50, 0, 14)
PlotScatter(lessage, lesswgt, 5, 30, 0, 14)

# print() calls replace Python 2 print statements, which are a SyntaxError
# under Python 3.
print("Greq 30 Pearson's corr:", thinkstats2.Corr(greqage, greqwgt))
print("Greq 30 Spearman corr:", thinkstats2.SpearmanCorr(greqage, greqwgt))
print("Less 30 Pearson's corr:", thinkstats2.Corr(lessage, lesswgt))
print("Less 30 Spearman corr:", thinkstats2.SpearmanCorr(lessage, lesswgt))
# Group the rows by bin index; data and indices are defined outside this
# excerpt.
groups = data.groupby(indices)

# Mean height and Cdf of the residuals per group; [1:-1] drops the first
# and last group.
means = [group.htm3.mean() for i, group in groups][1:-1]
cdfs = [thinkstats2.Cdf(group.residual) for i, group in groups][1:-1]

# plot the percentiles
for p in [75, 50, 25]:
    ys = [cdf.Percentile(p) for cdf in cdfs]
    label = str(p) + 'th'
    thinkplot.Plot(means, ys, label=label)

thinkplot.Config(xlabel='height (cm)', ylabel='residual weight (kg)')

#%%
# calculate correlation and coefficient of determination
rho = thinkstats2.Corr(heights, logWeight)
r2 = thinkstats2.CoefDetermination(logWeight, res)

# check if R^2 = rho^2
print("Correlation: {:.3f}".format(rho))
print("Coefficient of determination: {:.3f}".format(r2))

# The difference is taken in the order the label states (R^2 minus rho^2).
print("R^2 - rho^2: {:.3f}".format(r2 - rho**2))

#%%
# calc standard deviation (RMSE) of prediction w/o height
std_ys = thinkstats2.Std(logWeight)
print("Standard deviation w/o height: {:.3f}".format(std_ys))

#%%
# calc standard deviation (RMSE) of prediction w/ height
Ejemplo n.º 21
0
def SimulateNull(xs, ys):
    """Shuffle both sequences and return the correlation of the result.

    Both arguments are shuffled in place, xs first and then ys, so
    callers that need their data preserved must pass copies.

    xs: mutable sequence
    ys: mutable sequence

    returns: result of thinkstats2.Corr on the shuffled sequences
    """
    for seq in (xs, ys):
        random.shuffle(seq)
    return thinkstats2.Corr(xs, ys)
Ejemplo n.º 22
0
# plot the CDF of Age; df is defined outside this excerpt
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# normal probability plot: a model line with intercept = mean and
# slope = std over xs from -4 to 4, then the Age data on top
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# scatter plots and correlation
# year vs. age, with jitter of .25 applied to Year
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# NOTE(review): the result of this call is neither stored nor printed
thinkstats2.Corr(df.Year, df.Age)
# drug vs. age
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# testing a difference in gender; male, female and DiffMeansPermute are
# defined outside this excerpt
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')
Ejemplo n.º 23
0
    # bin the data
    bins = np.arange(120, 200, 6)
    indices = np.digitize(df.htm3, bins)
    groups = df.groupby(indices)

    # make cdfs
    height_means = [group.htm3.mean() for _, group in groups][1:-1]
    cdfs = [thinkstats2.Cdf(group.residual) for _, group in groups][1:-1]

    # make plot of percentiles
    PlotPercentileLines(height_means, cdfs,
                        xlabel='height(cm)',
                        ylabel='residual log_10 weight (log_10 kg)')

    ## calculate correlation
    rho = thinkstats2.Corr(heights, log_weights)
    print('rho:\n',rho)

    ## coefficient of determination
    res = df.residual
    r2 = CoefDetermination(log_weights, res)
    print('r2:\n',r2)

    ## confirm that R^2 = rho^2
    print('rho**2:\n',rho**2)
    print('r2:\n',r2)

    ## Std(ys)
    print('Std(log_weights):\n',Std(log_weights))
    print('Std(res):\n',Std(res))
    ratio = 1 - (Std(res) / Std(log_weights))