def testCov(self):
    """Check Cov, Corr and SpearmanCorr against known values.

    The same ten-element sequence is supplied both as a list and as a
    NumPy array, so every statistic is called with mixed input types.
    """
    seq = [0, 4, 7, 3, 8, 1, 6, 2, 9, 5]
    arr = np.array(seq)
    other = [5, 4, 3, 0, 8, 9, 7, 6, 2, 1]

    # (statistic, second argument, expected value), checked in this order
    cases = [
        (thinkstats2.Cov, arr, 8.25),
        (thinkstats2.Cov, -arr, -8.25),
        (thinkstats2.Corr, arr, 1),
        (thinkstats2.Corr, -arr, -1),
        (thinkstats2.Corr, other, -0.1878787878),
        (thinkstats2.SpearmanCorr, -arr, -1),
        (thinkstats2.SpearmanCorr, other, -0.1878787878),
    ]
    for statistic, ys, expected in cases:
        self.assertAlmostEqual(statistic(seq, ys), expected)
def ComputeCorrelations(heights, weights):
    """Report correlations and a least squares fit of log(weights) on heights.

    Prints Pearson's correlation of heights with weights and with
    log(weights), Spearman's correlation with weights, the least squares
    intercept and slope, and the coefficient of determination.  Each
    statistic is also asserted against a hard-coded expected value, and
    sqrt(R^2) is asserted to match the log-weight Pearson correlation.

    heights: sequence
    weights: sequence
    """
    rho = thinkstats2.Corr(heights, weights)
    assert almostEquals(rho, 0.508736478973)
    print('Pearson correlation (weights):', rho)

    logged = np.log(weights)
    rho_log = thinkstats2.Corr(heights, logged)
    assert almostEquals(rho_log, 0.531728260598)
    print('Pearson correlation (log weights):', rho_log)

    rank_rho = thinkstats2.SpearmanCorr(heights, weights)
    # unlike the two values above, this one is printed before it is checked
    print('Spearman correlation (weights):', rank_rho)
    assert almostEquals(rank_rho, 0.541535836332)

    intercept, gradient = thinkstats2.LeastSquares(heights, logged)
    print('Least squares inter, slope (log weights):', intercept, gradient)

    residuals = thinkstats2.Residuals(heights, logged, intercept, gradient)
    r_squared = thinkstats2.CoefDetermination(logged, residuals)
    root = math.sqrt(r_squared)
    print('Coefficient of determination:', r_squared)
    print('sqrt(R^2):', root)
    assert almostEquals(root, rho_log)
def Correlations(df):
    """Print covariance and correlations of the htm3 and wtkg2 columns.

    Values from the pandas column methods are printed next to the
    thinkstats2 equivalents so they can be compared by eye.  The last two
    statistics use the natural log of wtkg2.

    df: object exposing `htm3` and `wtkg2` as attributes whose values
        provide `.cov()` and `.corr()` (presumably a pandas DataFrame --
        confirm against the caller)
    """
    heights = df.htm3
    weights = df.wtkg2

    print('pandas cov', heights.cov(weights))
    print('thinkstats2 Cov', thinkstats2.Cov(heights, weights))
    print()

    print('pandas corr', heights.corr(weights))
    print('thinkstats2 Corr', thinkstats2.Corr(heights, weights))
    print()

    print('pandas corr spearman', heights.corr(weights, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(heights, weights))

    # The labels below used to read "wtkg3" although the value is computed
    # from wtkg2; they now name the column that is actually used.
    log_weights = np.log(weights)
    print('thinkstats2 SpearmanCorr log wtkg2',
          thinkstats2.SpearmanCorr(heights, log_weights))
    print()

    print('thinkstats2 Corr log wtkg2',
          thinkstats2.Corr(heights, log_weights))
    print()
def main(script):
    """Print age / birth-weight correlations and save a scatter plot.

    Seeds thinkstats2's random generator, loads the frames from
    first.MakeFrames(), keeps the live-birth rows that have both
    'agepreg' and 'totalwgt_lb', then runs BinnedPercentiles, prints the
    Pearson and Spearman correlations, and saves the scatter plot as
    'chap07scatter1' (jpg).

    script: not used in the body
    """
    thinkstats2.RandomSeed(17)

    # only the first of the three frames is used below
    all_live, _firsts, _others = first.MakeFrames()
    complete = all_live.dropna(subset=['agepreg', 'totalwgt_lb'])

    BinnedPercentiles(complete)

    mother_ages = complete.agepreg
    birth_weights = complete.totalwgt_lb
    print('thinkstats2 Corr', thinkstats2.Corr(mother_ages, birth_weights))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(mother_ages, birth_weights))

    ScatterPlot(mother_ages, birth_weights, alpha=0.1)
    thinkplot.Save(root='chap07scatter1', legend=False, formats=['jpg'])
def main(name, data_dir='.'):
    """Plot the data, print its correlations, and run the null simulations.

    Saves a scatter plot as 'correlate1', prints the Pearson and Spearman
    correlations, prints ten results of SimulateNull, and finally prints
    PValue(xs, ys, 1000).

    name: not used in the body
    data_dir: directory handed to ReadData

    Output is written with single-argument print(...) calls.  They emit
    exactly the text the former Python 2 print statements did, and they
    also parse under Python 3.
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # the 1-tuple keeps %-formatting safe if a result is itself a tuple
    print('Pearson %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    # SimulateNull receives fresh list copies on every iteration
    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))
def scatter(x):
    """Show a scatter plot of total crimes against column `x` of the global df.

    For x == 'month' only the plot is shown.  For any other column the
    plot is followed by printed statistics: Spearman's correlation and the
    covariance between total crimes and that column.

    x: name of a column in the module-level `df`
    """
    totals = df.Total_crimes
    thinkplot.Scatter(df[x], totals, alpha=.5)

    if x == 'month':
        thinkplot.Show(title="Total Crimes vs Time",
                       xlabel="Year",
                       ylabel="Total Crimes")
        return

    category = x + " Crimes"
    thinkplot.Show(title="Total Crimes vs " + category,
                   xlabel=category,
                   ylabel="Total Crimes")

    column = df[x]
    print(x + " crime stats")
    print("Spearman's correlation:", thinkstats2.SpearmanCorr(totals, column))
    print("Covariance:", thinkstats2.Cov(totals, column))
    print()
def ComputeLeastSquares(ages, weights):
    """Computes least squares fit for ages and weights.

    Prints the Pearson and Spearman correlations, the fitted intercept and
    slope, and the coefficient of determination, followed by a blank line.

    ages: sequence
    weights: sequence

    returns: tuple (inter, slope, R2)

    Output is written with single-argument print(...) calls.  They emit
    exactly the text the former Python 2 print statements did, and they
    also parse under Python 3.
    """
    # correlation between age and weight
    print('Pearson correlation %s' % (thinkstats2.Corr(ages, weights),))
    print('Spearman correlation %s'
          % (thinkstats2.SpearmanCorr(ages, weights),))

    # least squares fit
    inter, slope = thinkstats2.LeastSquares(ages, weights)
    print('(inter, slope): %s %s' % (inter, slope))

    res = thinkstats2.Residuals(ages, weights, inter, slope)
    R2 = thinkstats2.CoefDetermination(weights, res)

    print('R^2 %s' % (R2,))
    # blank line; a bare `print` is a no-op expression on Python 3
    print('')
    return inter, slope, R2
def main():
    """Generate correlated pairs, print summary statistics, and plot them.

    Seeds `random`, takes the pairs produced by
    CorrelatedGenerator(1000, rho) with rho = -0.8, applies the linear map
    a * x + b (a = 1.0, b = 0.0) to the x values, prints mean/variance,
    covariance and both correlations, then shows a scatter plot.

    Output is written with single-argument print(...) calls.  They emit
    exactly the text the former Python 2 print statements did, and they
    also parse under Python 3.
    """
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # linear transform of xs; with a=1, b=0 the values are unchanged
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # values are wrapped in 1-tuples so a tuple result (as MeanVar appears
    # to return) is formatted as a whole rather than unpacked by %
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('covariance %s' % (thinkstats2.Cov(xs, ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman corr %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def ComputeAirlineArrivalDelayCorrelations(flights):
    """Print covariance and correlations of airline code vs. arrival delay.

    Each statistic is printed twice, once from the pandas column method
    and once from thinkstats2, so the two can be compared.  This is
    similar to Correlations() in scatter.py.

    flights: DataFrame with AIRLINE, AIRLINE_CODE and ARRIVAL_DELAY columns
    """
    # NOTE(review): rows are filtered on AIRLINE, but the statistics below
    # read AIRLINE_CODE -- confirm AIRLINE_CODE cannot be missing where
    # AIRLINE is present.
    usable = flights.dropna(subset=['AIRLINE', 'ARRIVAL_DELAY'])
    codes = usable.AIRLINE_CODE
    delays = usable.ARRIVAL_DELAY

    print('pandas cov', codes.cov(delays))
    print('thinkstats2 Cov', thinkstats2.Cov(codes, delays))
    print()

    print('pandas corr Pearson', codes.corr(delays))
    print('thinkstats2 Corr Pearson', thinkstats2.Corr(codes, delays))
    print()

    print('pandas corr spearman', codes.corr(delays, method='spearman'))
    print('thinkstats2 SpearmanCorr',
          thinkstats2.SpearmanCorr(codes, delays))
    print()
age_means = [g.agepreg.mean() for i, g in groups] wgt_cdfs = [thinkstats2.Cdf(g.totalwgt_lb) for i, g in groups] percentiles = [75, 50, 25] thinkplot.PrePlot(len(percentiles)) for percent in percentiles: wgt_percentile = [cdf.Percentile(percent) for cdf in wgt_cdfs] label = '%dth' % percent thinkplot.Plot(age_means, wgt_percentile, label=label) thinkplot.Config(xlabel='Mother age (years)', ylabel='Birth weight (lbs)', legend=True) p_corr = thinkstats2.Corr(live_ss.agepreg, live_ss.totalwgt_lb) s_corr = thinkstats2.SpearmanCorr(live_ss.agepreg, live_ss.totalwgt_lb) print('Pearson\'s Correlation:', p_corr) print('Spearman\'s Correlation:', s_corr) #--- Chapter8 Ex2 def SimulateSample(lam=2, n=10, iters=1000): lams_est = [] for m in np.arange(iters): xs = np.random.exponential(1.0 / lam, n) L = 1 / np.mean(xs) lams_est.append(L) return lams_est def SampleDistrPLot(estimates, n, lam):
return greq, less def SplitFrames(df): df = df.dropna(subset=['agepreg', 'totalwgt_lb']) age = df.agepreg wgt = df.totalwgt_lb return age, wgt def PlotScatter(age, wgt, xmin, xmax, ymin, ymax): thinkplot.Scatter(age, wgt, alpha=1.0) thinkplot.Config(xlabel='Age (Years)', ylabel='Birth Weight (lbs)', xlim=[xmin, xmax], ylim=[ymin, ymax], legend=False) thinkplot.Show() greq, less = MakeFrames() greqage, greqwgt = SplitFrames(greq) lessage, lesswgt = SplitFrames(less) PlotScatter(greqage, greqwgt, 30, 50, 0, 14) PlotScatter(lessage, lesswgt, 5, 30, 0, 14) print "Greq 30 Pearson's corr:", thinkstats2.Corr(greqage, greqwgt) print "Greq 30 Spearman corr:", thinkstats2.SpearmanCorr(greqage, greqwgt) print "Less 30 Pearson's corr:", thinkstats2.Corr(lessage, lesswgt) print "Less 30 Spearman corr:", thinkstats2.SpearmanCorr(lessage, lesswgt)
def CorrelationPlots(df, xlabel, ylabel, xjitter=0, yjitter=0, axis=None, nbins=5, **options):
    """Draw a 2x2 panel relating two columns and print their correlations.

    Panels, in order: scatter plot, hexbin plot, 75th/50th/25th percentile
    curves of `ylabel` within bins of `xlabel`, and one CDF of `ylabel`
    per bin of `xlabel`.  Pearson's and Spearman's correlations of the
    jittered values are printed at the end.

    df: DataFrame; rows missing either column are dropped first
    xlabel: name of the column plotted on the x axis
    ylabel: name of the column plotted on the y axis
    xjitter: jitter amount passed to thinkstats2.Jitter for the x values
    yjitter: jitter amount passed to thinkstats2.Jitter for the y values
    axis: [xmin, xmax, ymin, ymax]; defaults to the range of the jittered data
    nbins: binning parameter (see the review notes in the body)
    options: accepted but not used
    """
    complete = df.dropna(subset=[xlabel, ylabel])
    xs = thinkstats2.Jitter(complete[xlabel], xjitter)
    ys = thinkstats2.Jitter(complete[ylabel], yjitter)

    # computed unconditionally, even when the caller supplies `axis`
    data_range = [min(xs), max(xs), min(ys), max(ys)]
    if axis is None:
        axis = data_range

    def _bin(values, edges):
        """Group `complete` by the bin of each value; return x means and y CDFs."""
        bucket_ids = np.digitize(values, edges)
        buckets = complete.groupby(bucket_ids)
        centers = [part[xlabel].mean() for _, part in buckets]
        dists = [thinkstats2.Cdf(part[ylabel]) for _, part in buckets]
        return centers, dists

    PrePlot(num=4, rows=2, cols=2)

    # panel 1: scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # panel 2: hexbin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # panel 3: percentile curves, binned on the jittered x values
    SubPlot(3)
    x_dist = thinkstats2.Cdf(xs)
    lower = x_dist.Percentile(1)
    upper = x_dist.Percentile(99)
    # NOTE(review): nbins is passed to np.arange as the step (bin width)
    # here, but is used as a bin count for panel 4 -- confirm the intent.
    edges = np.arange(lower, upper, nbins)
    centers, dists = _bin(xs, edges)
    for percent in (75, 50, 25):
        curve = [dist.Percentile(percent) for dist in dists]
        Plot(centers, curve, label='%dth' % percent)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # panel 4: one CDF per bin, binned on the un-jittered x column
    # NOTE(review): the divisor is zero when nbins == 2, and the step is
    # zero when upper - lower < nbins - 2 -- neither case is handled.
    width = (upper - lower) // (nbins - 2)
    edges = np.arange(lower, upper, width)
    _, dists = _bin(complete[xlabel], edges)

    SubPlot(4)
    PrePlot(len(dists))
    last = len(dists) - 1
    for i, dist in enumerate(dists):
        if i == 0:
            label = '<%d ' % edges[0] + xlabel
        elif i == last:
            label = '>%d ' % edges[-1] + xlabel
        else:
            label = '%d - %d ' % (edges[i - 1], edges[i]) + xlabel
        Cdf(dist, label=label)
    Config(xlabel=ylabel, ylabel='CDF', legend=True)

    # summary statistics of the jittered values
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n', thinkstats2.SpearmanCorr(xs, ys))
def TestStatistic(self, data):
    """Return the absolute value of Spearman's correlation for the data.

    data: pair of sequences (xs, ys)
    """
    first_seq, second_seq = data
    return abs(thinkstats2.SpearmanCorr(first_seq, second_seq))
# Summary statistics for every column after the first
summ_stats(df.columns[1:])

# PMFs of total crimes: the whole series and its last 60 entries
first_pmf = thinkstats2.Pmf(df.Total_crimes, label="PMF Crimes (2003-2020)")
# NOTE(review): the trailing comma makes the subscript a 1-tuple holding
# the slice; kept exactly as written.
second_pmf = thinkstats2.Pmf(df.Total_crimes[-60:, ], label="PMF Crimes Last 5 Years")

# Show both PMFs
ShowPMF(first_pmf, second_pmf)

# Normal probability plots
for column in ('Hooligan', 'Drugs'):
    MakeNormalPlot(column)

# Scatter plots against total crimes
for column in ('Serious', 'Theft', 'month'):
    scatter(column)

# Spearman correlation matrix over the frame's columns
print(df.corr(method='spearman'))

# Spearman correlation of Theft vs. Serious, then the correlation test
print(thinkstats2.SpearmanCorr(df.Theft, df.Serious))
corr_test()

# Regression models
for column in ('Theft', 'Serious'):
    Regress(column)
import thinkstats2
import thinkplot
import first
import numpy as np

# Load the frames and keep live births that have both age and weight.
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])

# Pearson and Spearman correlation of mother's age with birth weight.
rho = thinkstats2.Corr(live.agepreg, live.totalwgt_lb)
rho_s = thinkstats2.SpearmanCorr(live.agepreg, live.totalwgt_lb)
print("Pearson's Correlation, Mother's age and Birth weight: ", rho)
print("Spearman's Rank Correlation, Mother's age and Birth weight: ", rho_s)

# Scatter plot, saved with the legend flag switched off and restored after.
thinkplot.LEGEND = False
thinkplot.Scatter(live.agepreg, live.totalwgt_lb)
# The x label used to read "Mothers's age"; the typo is corrected here.
thinkplot.SaveFormat(root='age_weight_scatter',
                     fmt='png',
                     xlabel="Mother's age",
                     ylabel='Birth weight')
thinkplot.LEGEND = True

# Group rows into age bins with edges from 10 up to (not including) 45
# in steps of 2.5.
bins = np.arange(10, 45, 2.5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)

# Mean age and birth-weight CDF for each bin.
ages = [group.agepreg.mean() for i, group in groups]
cdfs = [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]

# One curve per percentile: birth weight at that percentile vs. mean age.
for percent in [75, 50, 25]:
    weights = [cdf.Percentile(percent) for cdf in cdfs]
    label = '%dth' % percent
    thinkplot.Plot(ages, weights, label=label)