def GetArrivalDepartureDelay(flights, hjitter=0.0, wjitter=0.0):
    """Get sequences of arrival delays and departure delays.

    flights: object with ARRIVAL_DELAY and DEPARTURE_DELAY attributes
    hjitter: float magnitude of random noise added to arrival delays
    wjitter: float magnitude of random noise added to departure delays

    returns: tuple of sequences (arrival delays, departure delays)
    """
    def MaybeJitter(values, magnitude):
        # A falsy magnitude (the default 0.0) leaves the sequence untouched.
        return thinkstats2.Jitter(values, magnitude) if magnitude else values

    return (
        MaybeJitter(flights.ARRIVAL_DELAY, hjitter),
        MaybeJitter(flights.DEPARTURE_DELAY, wjitter),
    )
def GetHeightWeight(df, hjitter=0.0, wjitter=0.0):
    """Extract the height and weight columns, optionally jittered.

    df: DataFrame with htm3 and wtkg2
    hjitter: float magnitude of random noise added to heights
    wjitter: float magnitude of random noise added to weights

    returns: tuple of sequences (heights, weights)
    """
    # Jitter is applied only for a truthy magnitude; the default 0.0
    # returns the column as-is.
    hts = df.htm3
    hts = thinkstats2.Jitter(hts, hjitter) if hjitter else hts

    wts = df.wtkg2
    wts = thinkstats2.Jitter(wts, wjitter) if wjitter else wts

    return hts, wts
# Exploratory analysis of the Age column.
# `df`, `male`, `female` and `DiffMeansPermute` are expected to be defined
# earlier in the file (not visible in this section).

# Empirical CDF of Age.
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# Normal probability plot of Age, with a gray reference line labelled
# 'model' whose intercept is the sample mean and whose slope is the sample
# standard deviation, evaluated at x = -4 and x = 4.
# NOTE(review): there is no thinkplot.Show() between the CDF above and the
# plots below, so they may end up on the same figure (which would carry the
# 'Age'/'CDF' axis labels) -- confirm that is intended.
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
# `xs` is rebound here; the [-4, 4] endpoints are no longer needed.
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# Scatter plots and correlation.
# Year vs. Age: Year is jittered by 0.25 before plotting.
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# NOTE(review): the correlation is computed on the un-jittered Year, but the
# result is neither stored nor printed, so outside an interactive session
# this statement has no visible effect.
thinkstats2.Corr(df.Year, df.Age)

# Drug vs. Age (no jitter applied).
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# Hypothesis test on Age between the `male` and `female` frames.
# Presumably DiffMeansPermute is a permutation test on the difference in
# means -- confirm against its definition.
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
# Plot the CDF produced by the test object and label the axes.
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')
def _BinLabel(key, bins, xlabel):
    """Build the legend label for one np.digitize bin.

    key: int bin index produced by np.digitize(values, bins)
    bins: sequence of bin edges that was passed to np.digitize
    xlabel: string name of the binned variable, appended to the label

    returns: string label
    """
    if key == 0:
        # np.digitize assigns index 0 to values below the first edge.
        return '<%d ' % bins[0] + xlabel
    if key == len(bins):
        # Index len(bins) holds values at or beyond the last edge.
        return '>%d ' % bins[-1] + xlabel
    return '%d - %d ' % (bins[key - 1], bins[key]) + xlabel


def CorrelationPlots(df, xlabel, ylabel, xjitter=0, yjitter=0, axis=None, nbins=5, **options):
    """Draw a 2x2 grid of plots relating two columns and print correlations.

    Rows with a missing value in either column are dropped first. The four
    panels are: a scatter plot, a hexbin plot, the 75th/50th/25th percentiles
    of y within bins of x, and the CDF of y within (coarser) bins of x.
    Finally the values returned by thinkstats2.Corr and
    thinkstats2.SpearmanCorr are printed.

    df: DataFrame containing the columns xlabel and ylabel
    xlabel: string column name used for the x variable (also the axis label)
    ylabel: string column name used for the y variable (also the axis label)
    xjitter: magnitude passed to thinkstats2.Jitter for the x values
    yjitter: magnitude passed to thinkstats2.Jitter for the y values
    axis: [xmin, xmax, ymin, ymax] for the first three panels; defaults to
          the extrema of the jittered data
    nbins: in the percentile panel this is the bin *width* (the np.arange
           step); in the CDF panel the bin width is
           (upper - lower) // (nbins - 2), where lower/upper are the 1st and
           99th percentiles of x. Must be greater than 2.
    options: accepted for call compatibility; currently unused

    raises: ValueError if the CDF-panel bin width cannot be computed as a
            positive number (nbins <= 2, or the 1st-99th percentile range of
            x is narrower than nbins - 2)
    """
    cleaned = df.dropna(subset=[xlabel, ylabel])
    xs = thinkstats2.Jitter(cleaned[xlabel], xjitter)
    ys = thinkstats2.Jitter(cleaned[ylabel], yjitter)

    # Only scan the data for its extrema when the caller gave no axis.
    if axis is None:
        axis = [min(xs), max(xs), min(ys), max(ys)]

    PrePlot(num=4, rows=2, cols=2)

    # make scatter plot
    SubPlot(1)
    Scatter(xs, ys, alpha=0.1, s=10)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # make HexBin plot
    SubPlot(2)
    HexBin(xs, ys)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=False)

    # plot percentiles of y within bins of the (jittered) x values
    SubPlot(3)
    xs_cdf = thinkstats2.Cdf(xs)
    lower = xs_cdf.Percentile(1)
    upper = xs_cdf.Percentile(99)
    bins = np.arange(lower, upper, nbins)
    indices = np.digitize(xs, bins)
    groups = cleaned.groupby(indices)
    mean_xs = [group[xlabel].mean() for _, group in groups]
    cdfs = [thinkstats2.Cdf(group[ylabel]) for _, group in groups]
    for percent in [75, 50, 25]:
        y_percentiles = [cdf.Percentile(percent) for cdf in cdfs]
        Plot(mean_xs, y_percentiles, label='%dth' % percent)
    Config(xlabel=xlabel, ylabel=ylabel, axis=axis, legend=True)

    # plot CDFs of y within coarser bins of the un-jittered x values
    divisor = nbins - 2
    if divisor <= 0:
        raise ValueError(
            'nbins must be greater than 2 to bin the CDF panel, got %r' % (nbins,))
    step = (upper - lower) // divisor
    if not step > 0:
        # A zero (or NaN) step cannot be used to build bin edges.
        raise ValueError(
            'range of %s between its 1st and 99th percentiles is too small '
            'for nbins=%r' % (xlabel, nbins))
    bins = np.arange(lower, upper, step)
    indices = np.digitize(cleaned[xlabel], bins)
    groups = cleaned.groupby(indices)
    # Keep each group's digitize index with its CDF: empty bins produce no
    # group, so the position in the list does not identify the bin.
    keyed_cdfs = [(key, thinkstats2.Cdf(group[ylabel])) for key, group in groups]

    ## plot the cdfs
    SubPlot(4)
    PrePlot(len(keyed_cdfs))
    for key, cdf in keyed_cdfs:
        Cdf(cdf, label=_BinLabel(key, bins, xlabel))
    Config(xlabel=ylabel, ylabel='CDF', legend=True)

    # print statistics
    # NOTE(review): these are computed on the jittered values, so a non-zero
    # xjitter/yjitter affects the reported numbers -- confirm that is intended.
    print('Correlation:\n', thinkstats2.Corr(xs, ys))
    print('Spearman Correlation Coefficient:\n', thinkstats2.SpearmanCorr(xs, ys))