def MakeNormalPlot(weights):
    """Computes the model line for a normal probability plot.

    Calls thinkstats2.FitLine on the endpoints [-4, 4], using the sample
    mean as the intercept and the sample standard deviation as the slope.

    weights: object with mean() and std() methods

    returns: tuple (fxs, fys) unpacked from the FitLine result
    """
    center = weights.mean()
    spread = weights.std()
    line_xs, line_ys = thinkstats2.FitLine([-4, 4], inter=center, slope=spread)
    return line_xs, line_ys
def PlotConfidenceIntervals(xs, inters, slopes, res=None, percent=90,
                            **options):
    """Plots a percentile band around a family of fitted lines.

    One line is generated with thinkstats2.FitLine for each
    (intercept, slope) pair.  When residuals are supplied, a random
    permutation of them is added to each line.  The rows returned by
    thinkstats2.PercentileRows for the lower and upper percentiles are
    handed to thinkplot.FillBetween.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes
    res: residuals, or None to leave the fitted lines unperturbed
    percent: what percentile range to show
    options: keyword arguments passed on to thinkplot.FillBetween
    """
    rows = []
    for params in zip(inters, slopes):
        line_xs, line_ys = thinkstats2.FitLine(xs, *params)
        if res is not None:
            # in-place add, exactly as the fitted values are perturbed
            line_ys += np.random.permutation(res)
        rows.append(line_ys)

    # split the excluded mass evenly between the two tails
    tail = (100 - percent) / 2
    lower, upper = thinkstats2.PercentileRows(rows, (tail, 100 - tail))

    # line_xs comes from the last fitted line
    thinkplot.FillBetween(line_xs, lower, upper, **options)
def PlotArrivalDepartureDelayFit(flights):
    """Plots departure delay against arrival delay with a fitted line.

    Draws 1000 rows with thinkstats2.SampleRows, fits a least-squares line
    to the sampled columns, saves a scatter plot overlaid with that line,
    and then summarizes a statsmodels OLS fit of the same relationship.

    flights: table with ARRIVAL_DELAY and DEPARTURE_DELAY columns
             (presumably a pandas DataFrame -- confirm against the caller)
    """
    subset = thinkstats2.SampleRows(flights, 1000)
    arrivals = subset.ARRIVAL_DELAY
    departures = subset.DEPARTURE_DELAY

    intercept, gradient = thinkstats2.LeastSquares(arrivals, departures)
    line_xs, line_ys = thinkstats2.FitLine(arrivals, intercept, gradient)

    thinkplot.Scatter(arrivals, departures, color='gray', alpha=0.1)
    # a wider white line under the blue one gives the fit a visible outline
    for colour, width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys, color=colour, linewidth=width)
    thinkplot.Save(root='ArrivalDepartureDelayFit_linear1',
                   xlabel='arrival delay (min)',
                   ylabel='departure delay (min)',
                   legend=False)

    # the regression is run on the sampled rows, not the full table
    results = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=subset).fit()
    regression.SummarizeResults(results)
def PlotConfidenceIntervals(xs, inters, slopes, percent=90, **options):
    """Plots a percentile band around a family of fitted lines.

    One line is generated with thinkstats2.FitLine for each
    (intercept, slope) pair; the rows returned by
    thinkstats2.PercentileRows for the lower and upper percentiles are
    handed to thinkplot.FillBetween.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes
    percent: what percentile range to show
    options: keyword arguments passed on to thinkplot.FillBetween
    """
    rows = []
    for params in zip(inters, slopes):
        line_xs, line_ys = thinkstats2.FitLine(xs, *params)
        rows.append(line_ys)

    # split the excluded mass evenly between the two tails
    tail = (100 - percent) / 2
    lower, upper = thinkstats2.PercentileRows(rows, (tail, 100 - tail))

    # line_xs comes from the last fitted line
    thinkplot.FillBetween(line_xs, lower, upper, **options)
def main(name, data_dir='.', null_model=True):
    """Plots the sampling distribution of the estimated slope.

    With the default null_model=True the line used for resampling is flat:
    the intercept is the mean of ys and the slope is 0.  This is the only
    path that used to execute; the least-squares analysis sat after an
    unconditional return and could never run.  Passing null_model=False
    runs that analysis instead: it fits a least-squares line, prints the
    fit parameters, the midpoint of the fitted line and the coefficient of
    determination, and resamples around the fitted line.

    Both modes save the CDF of the sampled slopes under the root 'regress1'.

    name: unused (presumably the script name from sys.argv -- confirm)
    data_dir: directory passed to ReadData
    null_model: if True use the flat line, otherwise the least-squares fit
    """
    random.seed(17)

    xs, ys = ReadData(data_dir)

    if null_model:
        inter = thinkstats2.Mean(ys)
        slope = 0
    else:
        inter, slope = thinkstats2.LeastSquares(xs, ys)
        # single-argument print() emits the same text on Python 2 and 3
        print('inter {}'.format(inter))
        print('slope {}'.format(slope))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)

    if not null_model:
        # floor division keeps the index an int on Python 3 as well
        i = len(fxs) // 2
        print('median weight, age {} {}'.format(fxs[i], fys[i]))

        R2 = thinkstats2.CoefDetermination(ys, res)
        print('R2 {}'.format(R2))
        print('R {}'.format(math.sqrt(R2)))

    inter_cdf, slope_cdf = SamplingDistributions(fxs, fys, res, n=1000)
    thinkplot.Cdf(slope_cdf)
    thinkplot.Save(root='regress1',
                   xlabel='Estimated slope (oz/year)',
                   ylabel='CDF',
                   title='Sampling distribution')
def PlotNormalProbability(sample, title="", ylabel=""):
    """Plots a normal probability plot of a sample next to a model line.

    The model line comes from thinkstats2.FitLine over [-5, 5], with the
    trimmed mean (p=0.01) as intercept and the square root of the trimmed
    variance as slope.

    sample: sequence of values
    title: string title passed to thinkplot.Config
    ylabel: string y-axis label passed to thinkplot.Config
    """
    mu, trimmed_var = thinkstats2.TrimmedMeanVar(sample, p=0.01)
    sigma = np.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-5, 5], inter=mu, slope=sigma)
    model_label = r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma)
    thinkplot.plot(model_xs, model_ys, color='gray', label=model_label)

    sample_xs, sample_ys = thinkstats2.NormalProbability(sample)
    thinkplot.Plot(sample_xs, sample_ys, label="actual")

    thinkplot.Config(title=title, xlabel="z", ylabel=ylabel)
def MakeNormalPlot(weights):
    """Plots a normal probability plot of the weights with a model line.

    The model line comes from thinkstats2.FitLine over [-5, 5], with the
    trimmed mean (p=0.01) as intercept and the square root of the trimmed
    variance as slope.

    weights: sequence
    """
    center, trimmed_var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    spread = math.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-5, 5], center, spread)
    thinkplot.Plot(model_xs, model_ys, color='0.8', label='model')

    sample_xs, sample_ys = thinkstats2.NormalProbability(weights)
    thinkplot.Plot(sample_xs, sample_ys, label='weights')
def MakeNormalPlot(x):
    """Shows a normal probability plot for one column of the global df.

    The model line comes from thinkstats2.FitLine over [-4, 4], with the
    trimmed mean (p=0.01) as intercept and the square root of the trimmed
    variance as slope.

    x: key used to select the column, df[x]; also shown in the plot title
    """
    column = df[x]

    center, trimmed_var = thinkstats2.TrimmedMeanVar(column, p=0.01)
    spread = math.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-4, 4], center, spread)
    thinkplot.Plot(model_xs, model_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    sample_xs, sample_ys = thinkstats2.NormalProbability(column)
    thinkplot.Plot(sample_xs, sample_ys, label='Number of Crimes')

    plot_title = 'Normal Prob Plot: {}'.format(x)
    thinkplot.Show(title=plot_title,
                   xlabel='Standard deviations from mean',
                   ylabel='Number of Crimes')
def PlotFit(live):
    """Saves a scatter plot of weight versus age with a least-squares line.

    The x values are live.agepreg and the y values are live.totalwgt_lb;
    the figure is saved under the root 'linear1'.

    live: DataFrame
    """
    age_col = live.agepreg
    weight_col = live.totalwgt_lb

    intercept, gradient = thinkstats2.LeastSquares(age_col, weight_col)
    line_xs, line_ys = thinkstats2.FitLine(age_col, intercept, gradient)

    thinkplot.Scatter(age_col, weight_col, color='gray', alpha=0.1)
    # a wider white line under the blue one gives the fit a visible outline
    for colour, width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys, color=colour, linewidth=width)

    thinkplot.Save(root='linear1',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   axis=[10, 45, 0, 15],
                   legend=False)
def MakeNormalPlot(arrivalDelays):
    """Saves a normal probability plot of the arrival delays.

    The model line comes from thinkstats2.FitLine over [-4, 4], with the
    sample mean as intercept and the sample standard deviation as slope.
    The figure is saved under 'NormalModel_arrivaldelay_normalplot'.

    arrivalDelays: object with mean() and std() methods
    """
    center = arrivalDelays.mean()
    spread = arrivalDelays.std()

    model_xs, model_ys = thinkstats2.FitLine([-4, 4], center, spread)
    thinkplot.Plot(model_xs, model_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    sample_xs, sample_ys = thinkstats2.NormalProbability(arrivalDelays)
    thinkplot.Plot(sample_xs, sample_ys, label='arrival delays (min)')

    thinkplot.Save(root='NormalModel_arrivaldelay_normalplot',
                   title='Normal probability plot',
                   xlabel='Standard deviations from mean',
                   ylabel='Arrival Delays (min)')
def MakeNormalPlot(weights, term_weights):
    """Saves a normal probability plot comparing two weight samples.

    The model line comes from thinkstats2.FitLine over [-4, 4], with the
    trimmed mean (p=0.01) of `weights` as intercept and the square root of
    its trimmed variance as slope.  Both samples are then plotted against
    it and the figure is saved under 'analytic_birthwgt_normal'.

    weights: sequence, plotted with the label 'all live'
    term_weights: sequence, plotted with the label 'full term'
    """
    center, trimmed_var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    spread = math.sqrt(trimmed_var)

    model_xs, model_ys = thinkstats2.FitLine([-4, 4], center, spread)
    thinkplot.Plot(model_xs, model_ys, linewidth=4, color='0.8')

    thinkplot.PrePlot(2)
    for values, tag in ((weights, 'all live'), (term_weights, 'full term')):
        sample_xs, sample_ys = thinkstats2.NormalProbability(values)
        thinkplot.Plot(sample_xs, sample_ys, label=tag)

    thinkplot.Save(root='analytic_birthwgt_normal',
                   title='Normal probability plot',
                   xlabel='Standard deviations from mean',
                   ylabel='Birth weight (lbs)')
def PlotConfidenceIntervals(xs, inters, slopes, res=None, percent=90,
                            **options):
    """Plots a percentile band around a family of fitted lines.

    One line is generated with thinkstats2.FitLine for each
    (intercept, slope) pair and stored as a row of a 2-D array.  When
    residuals are supplied, a random permutation of them is added to each
    line.  The array is then sorted down each column, so row k holds the
    k-th smallest fitted value at every x, and the rows at the lower and
    upper percentiles are handed to thinkplot.FillBetween.

    xs: sequence
    inters: estimated intercepts
    slopes: estimated slopes (its length sets the number of rows)
    res: residuals, or None to leave the fitted lines unperturbed
    percent: what percentile range to show
    options: keyword arguments passed on to thinkplot.FillBetween
    """
    nrows = len(slopes)
    array = np.zeros((nrows, len(xs)))

    for i, (inter, slope) in enumerate(zip(inters, slopes)):
        fxs, fys = thinkstats2.FitLine(xs, inter, slope)
        if res is not None:
            fys += np.random.permutation(res)
        array[i, ] = fys

    # sort each column independently
    array = np.sort(array, axis=0)

    def Percentile(p):
        """Selects the line from array that corresponds to percentile p.

        p: float 0--100

        returns: NumPy array (one row)
        """
        index = int(nrows * p / 100)
        # p == 100 maps to nrows, one past the last row; a negative p would
        # silently wrap around.  Clamp to the valid row range instead.
        index = min(max(index, 0), nrows - 1)
        return array[index, ]

    # true division, so an odd width such as percent=95 is not truncated
    # by Python 2 integer division
    p = (100 - percent) / 2.0
    low = Percentile(p)
    high = Percentile(100 - p)
    thinkplot.FillBetween(fxs, low, high, **options)
def main():
    """Fits a least-squares line to generated data and shows the result.

    Generates 1000 (x, y) pairs with SatIqData using rho = 0.8, prints the
    mean/variance of each variable, their Pearson correlation, the
    least-squares intercept and slope and the coefficient of determination,
    then shows the fitted line over a scatter plot.

    Output goes through single-argument print() calls, which emit the same
    text on Python 2 and Python 3 (the former print statements were a
    syntax error on Python 3).
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)
    print('mean, var of x {}'.format(thinkstats2.MeanVar(xs)))
    print('mean, var of y {}'.format(thinkstats2.MeanVar(ys)))
    print('Pearson corr {}'.format(thinkstats2.Corr(xs, ys)))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter {}'.format(inter))
    print('slope {}'.format(slope))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 {}'.format(R2))

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
# NOTE(review): mu, sigma, xs and ys are defined above this fragment; these two
# statements look like the tail of a per-sample plotting loop -- confirm the
# intended indentation against the preceding lines.
# Raw string: \m and \s are not valid escape sequences, so the plain literal
# triggers an invalid-escape warning; the r-prefix keeps the exact same text.
label = r'$\mu=%d$, $\sigma=%d$' % (mu, sigma)
thinkplot.Plot(xs, ys, label=label)

thinkplot.Config(title='Normal probability plot',
                 xlabel='standard normal sample',
                 ylabel='sample values')

#%% [markdown]
# Here's the normal probability plot for birth weights, showing that the lightest babies are lighter than we expect from the normal model, and the heaviest babies are heavier.

#%%
# Model line: trimmed mean (p=0.01) as intercept, square root of the trimmed
# variance as slope, passed to FitLine over [-4, 4].
mean, var = thinkstats2.TrimmedMeanVar(weights, p=0.01)
std = np.sqrt(var)

xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, mean, std)
thinkplot.Plot(fxs, fys, linewidth=4, color='0.8')

# xs is rebound here to the values returned by NormalProbability
xs, ys = thinkstats2.NormalProbability(weights)
thinkplot.Plot(xs, ys, label='all live')

thinkplot.Config(title='Normal probability plot',
                 xlabel='Standard deviations from mean',
                 ylabel='Birth weight (lbs)')

#%% [markdown]
# If we suspect that the deviation in the left tail is due to preterm babies, we can check by selecting only full term births.

#%%
# keep rows with prglngth >= 37, then drop missing weights
full_term = preg[preg.prglngth >= 37]
term_weights = full_term.totalwgt_lb.dropna()
# Normal probability plots for three random normal samples of size n, one per
# (mu, sigma) pair.
n = 1000
thinkplot.PrePlot(3)
for mu, sigma in zip([0, 1, 5], [1, 1, 2]):
    sample = np.random.normal(mu, sigma, n)
    xs, ys = thinkstats2.NormalProbability(sample)
    thinkplot.plot(xs, ys, label=r"$\mu$={} $\sigma$={}".format(mu, sigma))
thinkplot.Config(title="Normal probability plot",
                 xlabel="standard normal sample",
                 ylabel="sample value")

#%%
# Model line: trimmed mean (p=0.01) as intercept, square root of the trimmed
# variance as slope, passed to FitLine over [-4, 4].
# NOTE(review): totalwgt_lb and live are defined outside this fragment.
mu, var = thinkstats2.TrimmedMeanVar(totalwgt_lb, p=0.01)
# rows with prglngth >= 37, missing weights dropped; plotted below as "maturity"
maturity = live[live.prglngth >= 37].totalwgt_lb.dropna()
sigma = np.sqrt(var)
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mu, slope=sigma)
thinkplot.plot(fxs, fys, color='gray',
               label=r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma))
# xs is rebound here to the values returned by NormalProbability
xs, ys = thinkstats2.NormalProbability(totalwgt_lb)
thinkplot.Plot(xs, ys, label="all")
xs, ys = thinkstats2.NormalProbability(maturity)
thinkplot.Plot(xs, ys, label="maturity")
thinkplot.Config()

#%% [markdown]
# ## Lognormal distribution

#%%
import brfss
# Two histograms drawn with the same bar width, one aligned left and one
# aligned right.
# NOTE(review): male_pmf, female_pmf and df are defined outside this fragment.
width = 0.45
thinkplot.PrePlot(2)
thinkplot.Hist(male_pmf, width=width, align='left', color='blue')
thinkplot.Hist(female_pmf, width=width, align='right', color='red')
thinkplot.Config(ylabel='Probability')

# Plot the CDF of the Age column.
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# Normal probability plot of Age against a model line whose intercept is the
# sample mean and whose slope is the sample standard deviation.
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
# xs is rebound here to the values returned by NormalProbability
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# Scatter plots and correlation.
# Year vs. age: Year is passed through Jitter (second argument .25) first.
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# NOTE(review): the Corr result is not assigned or printed here -- presumably
# it is only displayed when run as a notebook cell; confirm.
thinkstats2.Corr(df.Year, df.Age)

# Drug vs. age.
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# Testing a difference in gender.
# NOTE(review): data is defined outside this fragment; the axis labels below
# suggest htm3 is height in cm and wtkg2 is weight in kg -- confirm.
weights = data.wtkg2
heights = data.htm3

# Base-10 log of the weights.
logWeight = np.log10(weights)

#%%
# Estimate intercept and slope of log10 weight as a function of height.
inter, slope = thinkstats2.LeastSquares(heights, logWeight)
print("intercept: {:.3f} \n slope: {:.3f}".format(inter, slope))

#%%
# Show a scatter plot with the fitted line.
thinkplot.Scatter(heights, logWeight, alpha=0.01, s=5)
fxs, fys = thinkstats2.FitLine(heights, inter, slope)
thinkplot.Plot(fxs, fys, color='red')
thinkplot.Config(xlabel='Height (cm)', ylabel='log10 weight (kg)')

#%%
# Get the residuals and store them as a new column on data.
res = thinkstats2.Residuals(heights, logWeight, inter, slope)
data['residual'] = res

#%%
# Set up bins, indices, and groups to calculate the mean and CDF per group.
# Bin edges run from 130 up to (but not including) 210 in steps of 5.
bins = np.arange(130, 210, 5)
indices = np.digitize(data.htm3, bins)
groups = data.groupby(indices)
# Mean height per group, with the first and last groups dropped
# (presumably the out-of-range bins -- TODO confirm).
means = [group.htm3.mean() for i, group in groups][1:-1]