Ejemplo n.º 1
0
    def testCov(self):
        t = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
        a = np.array(t)
        t2 = [5, 4, 3, 0, 8, 9, 7, 6, 2, 1]

        self.assertAlmostEqual(thinkstats2.Cov(t, a), 8.25)
        self.assertAlmostEqual(thinkstats2.Cov(t, -a), -8.25)

        self.assertAlmostEqual(thinkstats2.Corr(t, a), 1)
        self.assertAlmostEqual(thinkstats2.Corr(t, -a), -1)
        self.assertAlmostEqual(thinkstats2.Corr(t, t2), -0.1878787878)

        self.assertAlmostEqual(thinkstats2.SpearmanCorr(t, -a), -1)
        self.assertAlmostEqual(thinkstats2.SpearmanCorr(t, t2), -0.1878787878)
Ejemplo n.º 2
0
def ComputeCorrelations(heights, weights):
    """Compute correlations and least squares fit.

    heights: sequence
    weights: sequence
    """
    pearson = thinkstats2.Corr(heights, weights)
    assert almostEquals(pearson, 0.508736478973)
    print('Pearson correlation (weights):', pearson)

    log_weights = np.log(weights)
    log_pearson = thinkstats2.Corr(heights, log_weights)
    assert almostEquals(log_pearson, 0.531728260598)
    print('Pearson correlation (log weights):', log_pearson)

    spearman = thinkstats2.SpearmanCorr(heights, weights)
    print('Spearman correlation (weights):', spearman)
    assert almostEquals(spearman, 0.541535836332)

    inter, slope = thinkstats2.LeastSquares(heights, log_weights)
    print('Least squares inter, slope (log weights):', inter, slope)

    res = thinkstats2.Residuals(heights, log_weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(log_weights, res)
    R = math.sqrt(R2)
    print('Coefficient of determination:', R2)
    print('sqrt(R^2):', R)

    assert almostEquals(R, log_pearson)
Ejemplo n.º 3
0
def Correlations(df):
    print('pandas cov', df.htm3.cov(df.wtkg2))
    #print('NumPy cov', np.cov(df.htm3, df.wtkg2, ddof=0))
    print('thinkstats2 Cov', thinkstats2.Cov(df.htm3, df.wtkg2))
    print()

    print('pandas corr', df.htm3.corr(df.wtkg2))
    #print('NumPy corrcoef', np.corrcoef(df.htm3, df.wtkg2, ddof=0))
    print('thinkstats2 Corr', thinkstats2.Corr(df.htm3, df.wtkg2))
    print()

    print('pandas corr spearman', df.htm3.corr(df.wtkg2, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(df.htm3, df.wtkg2))
    print('thinkstats2 SpearmanCorr log wtkg3',
          thinkstats2.SpearmanCorr(df.htm3, np.log(df.wtkg2)))
    print()

    print('thinkstats2 Corr log wtkg3',
          thinkstats2.Corr(df.htm3, np.log(df.wtkg2)))
    print()
def main(script):
    thinkstats2.RandomSeed(17)

    live, firsts, others = first.MakeFrames()
    live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(live)

    ages = live.agepreg
    weights = live.totalwgt_lb
    print('thinkstats2 Corr', thinkstats2.Corr(ages, weights))
    print('thinkstats2 SpearmanCorr', thinkstats2.SpearmanCorr(ages, weights))

    ScatterPlot(ages, weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1', legend=False, formats=['jpg'])
Ejemplo n.º 5
0
def main(name, data_dir='.'):
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    print 'Pearson', thinkstats2.Corr(xs, ys)
    print 'Spearman', thinkstats2.SpearmanCorr(xs, ys)

    for i in range(10):
        print SimulateNull(list(xs), list(ys))

    print PValue(xs, ys, 1000)
def scatter(x):
    tot_crimes = df.Total_crimes
    thinkplot.Scatter(df[x], tot_crimes, alpha=.5)
    if x == 'month':
        thinkplot.Show(title="Total Crimes vs Time",
                       xlabel="Year",
                       ylabel="Total Crimes")
    else:
        thinkplot.Show(title="Total Crimes vs " + x + " Crimes",
                       xlabel=x + " Crimes",
                       ylabel="Total Crimes")
        print(x + " crime stats")
        print("Spearman's correlation:",
              thinkstats2.SpearmanCorr(tot_crimes, df[x]))
        print("Covariance:", thinkstats2.Cov(tot_crimes, df[x]))
        print()
Ejemplo n.º 7
0
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints summary statistics.
    """
    # compute the correlation between age and weight
    print 'Pearson correlation', thinkstats2.Corr(ages, weights)
    print 'Spearman correlation', thinkstats2.SpearmanCorr(ages, weights)

    # compute least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print '(inter, slope):', inter, slope

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print 'R^2', R2
    print
    return inter, slope, R2
Ejemplo n.º 8
0
def main():
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    print 'mean, var of x', thinkstats2.MeanVar(xs)
    print 'mean, var of y', thinkstats2.MeanVar(ys)
    print 'covariance', thinkstats2.Cov(xs, ys)
    print 'Pearson corr', thinkstats2.Corr(xs, ys)
    print 'Spearman corr', thinkstats2.SpearmanCorr(xs, ys)

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Ejemplo n.º 9
0
def ComputeAirlineArrivalDelayCorrelations(flights):
    """Compute the different correlations.
        This is similar to Correlations() in scatter.py
    """
    flights = flights.dropna(subset=['AIRLINE', 'ARRIVAL_DELAY'])
    print('pandas cov', flights.AIRLINE_CODE.cov(flights.ARRIVAL_DELAY))
    print('thinkstats2 Cov',
          thinkstats2.Cov(flights.AIRLINE_CODE, flights.ARRIVAL_DELAY))
    print()

    print('pandas corr Pearson',
          flights.AIRLINE_CODE.corr(flights.ARRIVAL_DELAY))
    print('thinkstats2 Corr Pearson',
          thinkstats2.Corr(flights.AIRLINE_CODE, flights.ARRIVAL_DELAY))
    print()

    print('pandas corr spearman',
          flights.AIRLINE_CODE.corr(flights.ARRIVAL_DELAY, method='spearman'))
    print(
        'thinkstats2 SpearmanCorr',
        thinkstats2.SpearmanCorr(flights.AIRLINE_CODE, flights.ARRIVAL_DELAY))
    print()
Ejemplo n.º 10
0
age_means = [g.agepreg.mean() for i, g in groups]
wgt_cdfs = [thinkstats2.Cdf(g.totalwgt_lb) for i, g in groups]

percentiles = [75, 50, 25]
thinkplot.PrePlot(len(percentiles))
for percent in percentiles:
    wgt_percentile = [cdf.Percentile(percent) for cdf in wgt_cdfs]
    label = '%dth' % percent
    thinkplot.Plot(age_means, wgt_percentile, label=label)
thinkplot.Config(xlabel='Mother age (years)',
                 ylabel='Birth weight (lbs)',
                 legend=True)

p_corr = thinkstats2.Corr(live_ss.agepreg, live_ss.totalwgt_lb)
s_corr = thinkstats2.SpearmanCorr(live_ss.agepreg, live_ss.totalwgt_lb)
print('Pearson\'s Correlation:', p_corr)
print('Spearman\'s Correlation:', s_corr)


#--- Chapter8 Ex2
def SimulateSample(lam=2, n=10, iters=1000):
    lams_est = []
    for m in np.arange(iters):
        xs = np.random.exponential(1.0 / lam, n)
        L = 1 / np.mean(xs)
        lams_est.append(L)
    return lams_est


def SampleDistrPLot(estimates, n, lam):
Ejemplo n.º 11
0
    return greq, less


def SplitFrames(df):
    df = df.dropna(subset=['agepreg', 'totalwgt_lb'])
    age = df.agepreg
    wgt = df.totalwgt_lb
    return age, wgt


def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    thinkplot.Scatter(age, wgt, alpha=1.0)
    thinkplot.Config(xlabel='Age (Years)',
                     ylabel='Birth Weight (lbs)',
                     xlim=[xmin, xmax],
                     ylim=[ymin, ymax],
                     legend=False)
    thinkplot.Show()


greq, less = MakeFrames()
greqage, greqwgt = SplitFrames(greq)
lessage, lesswgt = SplitFrames(less)
PlotScatter(greqage, greqwgt, 30, 50, 0, 14)
PlotScatter(lessage, lesswgt, 5, 30, 0, 14)
print "Greq 30 Pearson's corr:", thinkstats2.Corr(greqage, greqwgt)
print "Greq 30 Spearman corr:", thinkstats2.SpearmanCorr(greqage, greqwgt)
print "Less 30 Pearson's corr:", thinkstats2.Corr(lessage, lesswgt)
print "Less 30 Spearman corr:", thinkstats2.SpearmanCorr(lessage, lesswgt)
Ejemplo n.º 12
0
def CorrelationPlots(df,
                     xlabel,
                     ylabel,
                     xjitter=0,
                     yjitter=0,
                     axis=None,
                     nbins=5,
                     **options):

    cleaned = df.dropna(subset=[xlabel, ylabel])
    xs = cleaned[xlabel]
    ys = cleaned[ylabel]

    xs = thinkstats2.Jitter(xs, xjitter)
    ys = thinkstats2.Jitter(ys, yjitter)

    xmin, xmax = min(xs), max(xs)
    ymin, ymax = min(ys), max(ys)
    if axis is None:
        axis = [xmin, xmax, ymin, ymax]

    PrePlot(num=4, rows=2, cols=2)

    # make scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # make HexBin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # plot percentiles
    SubPlot(3)

    xs_cdf = thinkstats2.Cdf(xs)
    lower = xs_cdf.Percentile(1)
    upper = xs_cdf.Percentile(99)

    bins = np.arange(lower, upper, nbins)
    indices = np.digitize(xs, bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    for percent in [75, 50, 25]:
        y_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
        label = '%dth' % percent
        Plot(mean_xs, y_percentiles, label=label)

    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # plot CDFs
    n = (upper - lower) // (nbins - 2)
    bins = np.arange(lower, upper, n)
    indices = np.digitize(cleaned[xlabel], bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for i, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for i, group in groups]

    ## plot the cdfs
    SubPlot(4)
    PrePlot(len(cdfs))
    for i, cdf in enumerate(cdfs):
        if i == 0:
            label = '<%d ' % bins[0] + xlabel
        elif i == len(cdfs) - 1:
            label = '>%d ' % bins[-1] + xlabel
        else:
            label = '%d - %d ' % (bins[i - 1], bins[i]) + xlabel
        Cdf(cdf, label=label)
        Config(xlabel=ylabel, ylabel='CDF', legend=True)

    #print statistics
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n',
          thinkstats2.SpearmanCorr(xs, ys))
 def TestStatistic(self, data):
     xs, ys = data
     test_stat = abs(thinkstats2.SpearmanCorr(xs, ys))
     return test_stat
# Summary Stats of all variables
summ_stats(df.columns[1:])

# Generating PMFs for Total crimes of all times and past 5 years
first_pmf = thinkstats2.Pmf(df.Total_crimes, label="PMF Crimes (2003-2020)")
second_pmf = thinkstats2.Pmf(df.Total_crimes[-60:, ],
                             label="PMF Crimes Last 5 Years")

# Plotting the PMFs
ShowPMF(first_pmf, second_pmf)

# Normal Probability Plots for variables
MakeNormalPlot('Hooligan')
MakeNormalPlot('Drugs')

# Variables for the Scatterplot
scatter('Serious')
scatter('Theft')
scatter('month')

# Correlation Matrix for all variables
print(df.corr(method='spearman'))

# Testing the p-value for correlation
print(thinkstats2.SpearmanCorr(df.Theft, df.Serious))
corr_test()

# Creating a Regression Model
Regress('Theft')
Regress('Serious')
Ejemplo n.º 15
0
import thinkstats2
import thinkplot
import first
import numpy as np

live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

rho = thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
rho_s = thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)
print('Pearson\'s Correlation, Mother\'s age and Birth weight: ', rho)
print('Spearman\'s Rank Correlation, Mother\'s age and Birth weight: ', rho_s)

thinkplot.LEGEND = False
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
#thinkplot.Show(xlabel = 'Mother\'s age', ylabel = 'Birth weight')
thinkplot.SaveFormat(root='age_weight_scatter',
                     fmt='png',
                     xlabel='Mothers\'s age',
                     ylabel='Birth weight')

thinkplot.LEGEND = True
bins = np.arange(10, 45, 2.5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)
ages = [group.agepreg.mean() for i, group in groups]
cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]
for percent in [75, 50, 25]:
    weights = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(ages, weights, label=label)