def main():
    """Show four views of the data in 'mystery0.dat' on a 2x2 grid.

    The panels are the PMF of the raw values, the PMF of the binned
    values, a PMF derived from an estimated PDF, and the CDF.
    """
    data = read_file('mystery0.dat')
    lo, hi = min(data), max(data)

    # distributions built directly from the raw values
    raw_pmf = thinkstats2.MakePmfFromList(data)
    raw_cdf = thinkstats2.MakeCdfFromList(data)

    # estimated PDF evaluated at 101 evenly spaced points over the range
    grid = numpy.linspace(lo, hi, 101)
    smooth_pmf = thinkstats2.EstimatedPdf(data).MakePmf(grid)

    # PMF of the values returned by BinData
    binned_pmf = thinkstats2.MakePmfFromList(BinData(data, lo, hi, 51))

    thinkplot.SubPlot(2, 2, 1)
    thinkplot.Hist(raw_pmf, width=0.1)
    thinkplot.Config(title='Naive Pmf')

    thinkplot.SubPlot(2, 2, 2)
    thinkplot.Hist(binned_pmf)
    thinkplot.Config(title='Binned Hist')

    thinkplot.SubPlot(2, 2, 3)
    thinkplot.Pmf(smooth_pmf)
    thinkplot.Config(title='KDE PDF')

    thinkplot.SubPlot(2, 2, 4)
    thinkplot.Cdf(raw_cdf)
    thinkplot.Config(title='CDF')

    thinkplot.Show()
def MakeFigures(df):
    """Saves CDF and normal probability figures for weights and log weights.

    Each saved figure has two columns: weights on the left, log10 weights
    on the right.

    df: DataFrame with a wtkg2 column
    """
    kg = df.wtkg2.dropna()
    log_kg = np.log10(kg)

    # figure 'brfss_weight': one MakeNormalModel panel per scale
    thinkplot.PrePlot(cols=2)
    MakeNormalModel(kg)
    thinkplot.Config(xlabel='adult weight (kg)', ylabel='CDF')

    thinkplot.SubPlot(2)
    MakeNormalModel(log_kg)
    thinkplot.Config(xlabel='adult weight (log10 kg)')

    thinkplot.Save(root='brfss_weight')

    # figure 'brfss_weight_normal': one MakeNormalPlot panel per scale
    thinkplot.PrePlot(cols=2)
    MakeNormalPlot(kg)
    thinkplot.Config(xlabel='z', ylabel='weights (kg)')

    thinkplot.SubPlot(2)
    MakeNormalPlot(log_kg)
    thinkplot.Config(xlabel='z', ylabel='weights (log10 kg)')

    thinkplot.Save(root='brfss_weight_normal')
def MakeFigures(df):
    """Saves two two-panel figures of heights versus weights.

    'scatter1': plain scatter plot (left) and jittered scatter plot (right)
    of rows drawn with thinkstats2.SampleRows(df, 5000).
    'scatter2': the jittered sample drawn with alpha=0.1 (left) and a
    hexbin plot of jittered values taken from all of df (right).

    df: DataFrame passed to thinkstats2.SampleRows and GetHeightWeight
    """
    subset = thinkstats2.SampleRows(df, 5000)
    jitter = dict(hjitter=1.3, wjitter=0.5)

    # scatter1, left panel: no jitter
    thinkplot.PrePlot(cols=2)
    plain_h, plain_w = GetHeightWeight(subset)
    ScatterPlot(plain_h, plain_w)

    # scatter1, right panel: jittered
    thinkplot.SubPlot(2)
    jittered_h, jittered_w = GetHeightWeight(subset, **jitter)
    ScatterPlot(jittered_h, jittered_w)
    thinkplot.Save(root='scatter1')

    # scatter2, left panel: reuse the jittered sample, add transparency
    thinkplot.PrePlot(cols=2)
    ScatterPlot(jittered_h, jittered_w, alpha=0.1)

    # scatter2, right panel: hexbin over all of df
    thinkplot.SubPlot(2)
    all_h, all_w = GetHeightWeight(df, **jitter)
    HexBin(all_h, all_w)
    thinkplot.Save(root='scatter2')
def ex3():
    """Explores how the RMSE of simulated goal totals depends on lam and m.

    Prints the RMSE of 1000 simulated games at lam=4, then shows a 2x2 grid:

    1. histogram of those 1000 goal totals
    2. their CDF, with vertical lines at the 5th and 95th percentiles
    3. RMSE versus lam for lam in 1..14 (1000 games per point)
    4. RMSE versus the number of games m = 10, 20, ..., 990 at the same lam
    """
    def VertLine(x, y=1):
        """Draws a light gray vertical line from (x, 0) to (x, y)."""
        thinkplot.Plot([x, x], [0, y], color='0.8', linewidth=3)

    def SimulatedRmse(rate, m):
        """Returns the RMSE of m simulated goal totals against rate."""
        return RMSE([SimulateGame(lam=rate) for _ in range(m)], rate)

    lam = 4
    goal_totals = [SimulateGame(lam=lam) for _ in range(1000)]
    print('RMSE', RMSE(goal_totals, lam))

    hist = thinkstats2.Hist(goal_totals)
    cdf = thinkstats2.Cdf(goal_totals)

    thinkplot.PrePlot(rows=2, cols=2)

    thinkplot.SubPlot(1)
    thinkplot.Hist(hist)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf)
    VertLine(cdf.Percentile(5))
    VertLine(cdf.Percentile(95))

    # lambda vs. rmse
    # author's note: rmse goes up as lambda goes up
    thinkplot.SubPlot(3)
    lams = range(1, 15)
    rmses = [SimulatedRmse(rate, 1000) for rate in lams]
    thinkplot.Plot(lams, rmses)

    # m vs. rmse
    # author's note: maybe rmse very slowly goes down as m goes up?
    # not at all clear that's really the case...
    #
    # lam is passed explicitly to both the simulation and the RMSE
    # reference so the two cannot drift apart if SimulateGame's default
    # rate is not 4.
    thinkplot.SubPlot(4)
    ms = np.arange(10, 1000, 10)
    rmses = [SimulatedRmse(lam, m) for m in ms]
    thinkplot.Plot(ms, rmses)

    thinkplot.show()
def MakePdfs(greq, less):
    """Shows estimated PDFs of totalwgt_lb for two groups side by side.

    greq: DataFrame with a totalwgt_lb column; drawn in the left panel
          with the label 'greater/equal to 30'
    less: DataFrame with a totalwgt_lb column; drawn in the right panel
          with the label 'less than 30'
    """
    # estimate both PDFs before any plotting starts
    panels = [
        (thinkstats2.EstimatedPdf(greq.totalwgt_lb.dropna()),
         'greater/equal to 30'),
        (thinkstats2.EstimatedPdf(less.totalwgt_lb.dropna()),
         'less than 30'),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (pdf, label) in enumerate(panels, start=1):
        thinkplot.SubPlot(position)
        thinkplot.Pdf(pdf, label=label)
        thinkplot.Config(xlabel='Birth weight (lbs)', ylabel='PDF')

    thinkplot.Show()
def PlotRemainingLifetime(sf1, sf2):
    """Saves a two-panel figure of remaining lifetimes.

    The left panel plots sf1.RemainingLifetime() with its default
    arguments; the right panel plots sf2.RemainingLifetime() using the
    50th percentile as the summary function and np.inf as the filler.

    sf1: SurvivalFunction for pregnancy length
    sf2: SurvivalFunction for age at first marriage
    """
    def Median(pmf):
        """Returns the 50th percentile of pmf."""
        return pmf.Percentile(50)

    thinkplot.PrePlot(cols=2)

    # left panel
    thinkplot.Plot(sf1.RemainingLifetime())
    thinkplot.Config(title='remaining pregnancy length',
                     xlabel='weeks',
                     ylabel='mean remaining weeks')

    # right panel
    thinkplot.SubPlot(2)
    thinkplot.Plot(sf2.RemainingLifetime(filler=np.inf, func=Median))
    thinkplot.Config(title='years until first marriage',
                     ylim=[0, 15],
                     xlim=[11, 31],
                     xlabel='age (years)',
                     ylabel='median remaining years')

    thinkplot.Save(root='survival6', formats=FORMATS)
def main():
    """Plots the CDF of the 'ps' column on two scales, then prints the data.

    The left panel uses a log x scale and the right panel uses the
    'pareto' transform; each panel is configured with the options that
    thinkplot.Cdf returns.
    """
    df = ReadData()
    cdf = thinkstats2.Cdf(df['ps'])

    panels = [
        ('logx', {'xscale': 'log'}),
        ('pareto', {'transform': 'pareto'}),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (title, cdf_options) in enumerate(panels, start=1):
        thinkplot.SubPlot(position)
        scale = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **scale)

    thinkplot.show(legend=False)
    print(df)
def MakeBabyBoom():
    """Plot CDF of interarrival time on log and linear scales.

    Saves two figures:

    'analytic_interarrivals': CDF (left) and CCDF on a log y scale (right)
    of the differences between consecutive values of the minutes column.
    'analytic_interarrivals_model': CCDFs of those differences and of an
    exponential sample of the same length, on a log y scale.
    """
    # compute the interarrival times
    df = ReadBabyBoom()
    diffs = df.minutes.diff()
    cdf = thinkstats2.Cdf(diffs, label='actual')

    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(cdf)
    thinkplot.Config(xlabel='minutes', ylabel='CDF', legend=False)

    thinkplot.SubPlot(2)
    thinkplot.Cdf(cdf, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals')

    # Model: exponential interarrival times.  random.expovariate takes the
    # rate (1 / mean), and diffs is in minutes, so the rate must be per
    # minute: presumably 44 births over 24 hours, i.e. 44 / (24 * 60).
    # Writing it as 44 / 24 * 60 multiplies by 60 instead of dividing.
    n = len(diffs)
    lam = 44.0 / (24 * 60)
    sample = [random.expovariate(lam) for _ in range(n)]
    model = thinkstats2.Cdf(sample, label='model')

    thinkplot.PrePlot(2)
    thinkplot.Cdfs([cdf, model], complement=True)
    thinkplot.Save(root='analytic_interarrivals_model',
                   title='Time between births',
                   xlabel='minutes',
                   ylabel='CCDF',
                   yscale='log')
def MakeHists(live):
    """Saves a two-panel figure of the floored agepreg values.

    The same Hist is drawn with thinkplot.Hist in the left panel and with
    thinkplot.Pmf in the right panel.

    live: DataFrame with an agepreg column
    """
    whole_years = np.floor(live.agepreg)
    age_hist = thinkstats2.Hist(whole_years, label='agepreg')

    thinkplot.PrePlot(2, cols=2)

    # left panel
    thinkplot.SubPlot(1)
    thinkplot.Hist(age_hist)
    thinkplot.Config(xlabel='years',
                     ylabel='frequency',
                     axis=[0, 45, 0, 700])

    # right panel
    thinkplot.SubPlot(2)
    thinkplot.Pmf(age_hist)

    thinkplot.Save(root='probability_agepreg_hist',
                   xlabel='years',
                   axis=[0, 45, 0, 700])
def main(script, filename='mystery0.dat'):
    """Shows the data in filename on six different scales.

    The 2x3 grid holds the CDF on a linear scale, the CDF on a log-x
    scale, the 'exponential', 'pareto' and 'weibull' transforms of the
    CDF, and a normal probability plot.

    script: not used
    filename: name of the file passed to ReadFile
    """
    data = ReadFile(filename)
    cdf = thinkstats2.Cdf(data)

    def ScaledCdf(position, title, ylabel, **cdf_options):
        """Draws cdf in one panel and applies the scale thinkplot returns."""
        thinkplot.SubPlot(position)
        scale = thinkplot.Cdf(cdf, color='C0', **cdf_options)
        thinkplot.Config(title=title, ylabel=ylabel, **scale)

    thinkplot.PrePlot(num=6, rows=2, cols=3)

    # panel 1: plain CDF; the only panel that carries a label
    thinkplot.SubPlot(1)
    thinkplot.Cdf(cdf, color='C0', label=filename)
    thinkplot.Config(title='CDF on linear scale', ylabel='CDF')

    ScaledCdf(2, 'CDF on log-x scale', 'CDF', xscale='log')
    ScaledCdf(3, 'CCDF on log-y scale', 'log CCDF', transform='exponential')

    # panel 4: normal probability plot
    thinkplot.SubPlot(4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys, color='C0')
    thinkplot.Config(title='Normal probability plot',
                     xlabel='random normal',
                     ylabel='data')

    ScaledCdf(5, 'CCDF on log-log scale', 'log CCDF', transform='pareto')
    ScaledCdf(6, 'CCDF on loglog-y log-x scale', 'log log CCDF',
              transform='weibull')

    thinkplot.Show(legend=False)
def main():
    """Shows the data in 'mystery0.dat' on six different scales.

    The 2x3 grid holds the CDF on a linear scale, the CDF on a log-x
    scale, the 'exponential', 'pareto' and 'weibull' transforms of the
    CDF, and a normal probability plot.
    """
    data = read_file('mystery0.dat')
    cdf = thinkstats2.MakeCdfFromList(data)

    def ScaledCdf(index, title, **cdf_options):
        """Draws cdf in panel index and applies the scale thinkplot returns."""
        thinkplot.SubPlot(2, 3, index)
        scale = thinkplot.Cdf(cdf, **cdf_options)
        thinkplot.Config(title=title, **scale)

    # panel 1: plain CDF
    thinkplot.SubPlot(2, 3, 1)
    thinkplot.Cdf(cdf)
    thinkplot.Config(title='linear')

    ScaledCdf(2, 'logx', xscale='log')
    ScaledCdf(3, 'expo', transform='exponential')

    # panel 4: normal probability plot
    thinkplot.SubPlot(2, 3, 4)
    xs, ys = thinkstats2.NormalProbability(data)
    thinkplot.Plot(xs, ys)
    thinkplot.Config(title='normal')

    ScaledCdf(5, 'pareto', transform='pareto')
    ScaledCdf(6, 'weibull', transform='weibull')

    thinkplot.Show()
def main(script, filename='mystery0.dat'):
    """Shows the data in filename on six different scales.

    Panels 1-3 and 5-6 draw the CDF (linear, log-x, and the
    'exponential', 'pareto' and 'weibull' transforms); panel 4 is a
    normal probability plot.

    script: not used
    filename: name of the file passed to ReadFile
    """
    data = ReadFile(filename)
    cdf = thinkstats2.Cdf(data)

    # panels that draw a rescaled CDF: position -> (title, Cdf options)
    rescaled = {
        2: ('logx', {'xscale': 'log'}),
        3: ('expo', {'transform': 'exponential'}),
        5: ('pareto', {'transform': 'pareto'}),
        6: ('weibull', {'transform': 'weibull'}),
    }

    thinkplot.PrePlot(rows=2, cols=3)
    for position in range(1, 7):
        thinkplot.SubPlot(position)
        if position == 1:
            thinkplot.Cdf(cdf)
            thinkplot.Config(title='linear')
        elif position == 4:
            xs, ys = thinkstats2.NormalProbability(data)
            thinkplot.Plot(xs, ys)
            thinkplot.Config(title='normal')
        else:
            title, cdf_options = rescaled[position]
            scale = thinkplot.Cdf(cdf, **cdf_options)
            thinkplot.Config(title=title, **scale)

    thinkplot.Show(legend=False)
def MakeHists(male, female):
    """Shows histograms of the alcwknd column for two groups side by side.

    male: DataFrame with an alcwknd column (left panel, y axis to 600)
    female: DataFrame with an alcwknd column (right panel, y axis to 1200)
    """
    panels = [
        (male, 600, 'Weekend Alcohol Consumption for Men'),
        (female, 1200, 'Weekend Alcohol Consumption for Women'),
    ]

    thinkplot.PrePlot(rows=1, cols=2)
    for position, (group, y_max, title) in enumerate(panels, start=1):
        hist = thinkstats2.Hist(group.alcwknd)
        thinkplot.SubPlot(position)
        # the panel is configured before the bars are drawn
        thinkplot.Config(axis=[0, 800, 0, y_max],
                         ylabel='Number of people',
                         xlabel='Alcohol consumed (grams)',
                         title=title)
        thinkplot.Hist(hist, alpha=1)

    thinkplot.Show()
def MakeFigures():
    """Plots the CDF of populations in several forms.

    Saves 'populations_pareto' (CCDF of log10 population on a log y scale
    against a Pareto model) and 'populations_normal' (CDF of log10
    population against a normal model, plus a normal probability plot).

    Author's reading of the figures: on a log-log scale the tail of the
    CCDF looks like a straight line, which suggests a Pareto distribution,
    but that is misleading.  On a log-x scale the distribution has the
    sigmoid shape characteristic of a lognormal, and the normal
    probability plot of log(sizes) confirms that the lognormal model fits
    very well.  Many phenomena described with Pareto models can be
    described as well, or better, with lognormal models.
    """
    pops = ReadData()
    print('Number of cities/towns', len(pops))

    log_pops = np.log10(pops)
    cdf = thinkstats2.Cdf(pops, label='data')  # not used below
    cdf_log = thinkstats2.Cdf(log_pops, label='data')
    model_style = dict(label='model', color='0.8')

    # Pareto model against the CCDF of log10 population
    xs, ys = thinkstats2.RenderParetoCdf(
        xmin=5000, alpha=1.4, low=0, high=1e7)
    thinkplot.Plot(np.log10(xs), 1-ys, **model_style)
    thinkplot.Cdf(cdf_log, complement=True)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CCDF',
                     yscale='log')
    thinkplot.Save(root='populations_pareto')

    # normal model against the CDF of log10 population (left panel)
    thinkplot.PrePlot(cols=2)
    mu = log_pops.mean()
    sigma = log_pops.std()
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
    thinkplot.Plot(xs, ps, **model_style)
    thinkplot.Cdf(cdf_log)
    thinkplot.Config(xlabel='log10 population', ylabel='CDF')

    # normal probability plot of log10 population (right panel)
    thinkplot.SubPlot(2)
    thinkstats2.NormalProbabilityPlot(log_pops, label='data')
    thinkplot.Config(xlabel='z',
                     ylabel='log10 population',
                     xlim=[-5, 5])
    thinkplot.Save(root='populations_normal')
def pmf_stuff(width, x_low, x_high, third, pmf_one, pmf_two, label, y_axis_scale):
    """Shows two PMFs as side-by-side bars and as thinkplot.Pmfs lines.

    width: bar width used in the left panel
    x_low, x_high: first two entries of the axis list
    third: third entry of the axis list
    pmf_one, pmf_two: objects accepted by thinkplot.Hist and thinkplot.Pmfs
    label: x-axis label for both panels
    y_axis_scale: fourth entry of the axis list
    """
    limits = [x_low, x_high, third, y_axis_scale]
    shared = dict(xlabel=label, ylabel='PMF', axis=limits)

    # left panel: bars aligned on opposite sides of each value
    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(pmf_one, align='right', width=width)
    thinkplot.Hist(pmf_two, align='left', width=width)
    thinkplot.Config(**shared)

    # right panel: both PMFs drawn by thinkplot.Pmfs
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([pmf_one, pmf_two])
    thinkplot.Config(**shared)

    thinkplot.Show()
def plot_bar_step(first_pmf, other_pmf):
    """Shows two PMFs as side-by-side bars and as thinkplot.Pmfs lines.

    The two panels sit next to each other (PrePlot with cols=2).

    first_pmf: drawn with left-aligned bars
    other_pmf: drawn with right-aligned bars
    """
    bar_width = 0.5

    # left panel: bar graph
    thinkplot.PrePlot(2, cols=2)
    for pmf, side in ((first_pmf, "left"), (other_pmf, "right")):
        thinkplot.Hist(pmf, align=side, width=bar_width)
    thinkplot.Config(xlabel="weeks",
                     ylabel="probability",
                     axis=[27, 46, 0, 0.6])

    # right panel: both PMFs drawn by thinkplot.Pmfs
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Show(xlabel="weeks", axis=[27, 46, 0, 0.6])
def NormalPlotSamples(samples, plot=1, ylabel=''):
    """Makes one normal probability plot per sample, in consecutive subplots.

    samples: iterable of (n, sample) pairs; n is shown in the panel title
    plot: subplot index used for the first sample
    ylabel: y-axis label applied to every panel
    """
    for position, (n, sample) in enumerate(samples, start=plot):
        thinkplot.SubPlot(position)
        thinkstats2.NormalProbabilityPlot(sample)
        thinkplot.Config(title='n=%d' % n,
                         legend=False,
                         xticks=[],
                         yticks=[],
                         ylabel=ylabel)
def PlotDailies(dailies):
    """Saves a three-row figure with a scatter plot of ppg per entry.

    Only the first row gets a title; only the third row keeps its x ticks
    (rotated 30 degrees).

    dailies: map from name to DataFrame with a ppg column
    """
    thinkplot.PrePlot(rows=3)

    for position, (name, daily) in enumerate(dailies.items(), start=1):
        thinkplot.SubPlot(position)

        heading = 'price per gram ($)' if position == 1 else ''
        thinkplot.Config(ylim=[0, 20], title=heading)
        thinkplot.Scatter(daily.ppg, s=10, label=name)

        if position != 3:
            thinkplot.Config(xticks=[])
        else:
            pyplot.xticks(rotation=30)

    thinkplot.Save(root='timeseries1', formats=FORMATS)
def MakeAcfPlot(dailies):
    """Makes a figure showing autocorrelation functions.

    Saves 'timeseries9' with two panels that share the same axis limits:
    PlotAutoCorrelation with add_weekly=False on the left and with
    add_weekly=True on the right.

    dailies: map from category name to DataFrame of daily prices
    """
    axis = [0, 41, -0.2, 0.2]
    # Both panels show the same quantity, so they use the same x label
    # (the left one previously read 'lag (day)').
    xlabel = 'lag (days)'

    thinkplot.PrePlot(cols=2)
    PlotAutoCorrelation(dailies, add_weekly=False)
    thinkplot.Config(axis=axis,
                     loc='lower right',
                     ylabel='correlation',
                     xlabel=xlabel)

    thinkplot.SubPlot(2)
    PlotAutoCorrelation(dailies, add_weekly=True)
    thinkplot.Save(root='timeseries9',
                   axis=axis,
                   loc='lower right',
                   xlabel=xlabel)
def MakeBabyBoom():
    """Saves the CDF and CCDF of the differences between arrival times.

    The figure 'analytic_interarrivals' has the CDF on the left and the
    CCDF on a log y scale on the right.
    """
    # differences between consecutive values of the minutes column
    gaps = ReadBabyBoom().minutes.diff()
    gap_cdf = thinkstats2.Cdf(gaps, label='actual')

    # left panel: CDF
    thinkplot.PrePlot(cols=2)
    thinkplot.Cdf(gap_cdf)
    thinkplot.Config(xlabel='minutes', ylabel='CDF', legend=False)

    # right panel: CCDF, log y
    thinkplot.SubPlot(2)
    thinkplot.Cdf(gap_cdf, complement=True)
    thinkplot.Config(xlabel='minutes',
                     ylabel='CCDF',
                     yscale='log',
                     legend=False)

    thinkplot.Save(root='analytic_interarrivals', legend=False)
def PlotMarriageData(resp):
    """Saves a two-row figure of the hazard and survival functions.

    resp: DataFrame of respondents, passed to EstimateMarriageSurvival

    returns: the survival function from EstimateMarriageSurvival
    """
    hazard, survival = EstimateMarriageSurvival(resp)

    # top row: hazard function
    thinkplot.PrePlot(rows=2)
    thinkplot.Plot(hazard)
    thinkplot.Config(ylabel='hazard', legend=False)

    # bottom row: survival function
    thinkplot.SubPlot(2)
    thinkplot.Plot(survival)

    save_options = dict(xlabel='age (years)',
                        ylabel='prob unmarried',
                        ylim=[0, 1],
                        legend=False,
                        formats=FORMATS)
    thinkplot.Save(root='survival2', **save_options)
    return survival
def PlotSurvival(complete):
    """Plots the CDF and survival curve, then the hazard function.

    Also prints cdf[13], sf[13] and hf[39].  The figure is neither shown
    nor saved here.

    complete: list of complete lifetimes
    """
    thinkplot.PrePlot(3, rows=2)

    lifetime_cdf = thinkstats2.Cdf(complete, label='cdf')
    survival = MakeSurvivalFromCdf(lifetime_cdf, label='survival')
    for curve in (lifetime_cdf, survival):
        print(curve[13])

    # first row: survival curve with the CDF drawn faintly
    thinkplot.Plot(survival)
    thinkplot.Cdf(lifetime_cdf, alpha=0.2)
    thinkplot.Config()

    # second row: hazard function
    thinkplot.SubPlot(2)
    hazard = survival.MakeHazardFunction(label='hazard')
    print(hazard[39])
    thinkplot.Plot(hazard)
    thinkplot.Config(ylim=[0, 0.75])
def PlotRollingMean(daily, name):
    """Plots rolling mean and EWMA.

    Saves 'timeseries10' with two panels.  Each shows the ppg values as a
    faint scatter plot, overlaid with the 30-row rolling mean (left) or
    the exponentially weighted mean with span=30 (right).

    daily: DataFrame of daily prices with a ppg column
    name: label for the scatter plot
    """
    # Reindex over every date between the first and last index value so
    # the windows below count rows on a gap-free daily index.
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)

    thinkplot.PrePlot(cols=2)
    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
    # Series.rolling(...).mean() replaces pandas.rolling_mean, which has
    # been removed from pandas.
    roll_mean = reindexed.ppg.rolling(30).mean()
    thinkplot.Plot(roll_mean, label='rolling mean')
    pyplot.xticks(rotation=30)
    thinkplot.Config(ylabel='price per gram ($)')

    thinkplot.SubPlot(2)
    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
    # Series.ewm(...).mean() replaces the removed pandas.ewma.
    ewma = reindexed.ppg.ewm(span=30).mean()
    thinkplot.Plot(ewma, label='EWMA')
    pyplot.xticks(rotation=30)
    thinkplot.Save(root='timeseries10', formats=FORMATS)
def MakeFigures(firsts, others):
    """Saves figures comparing the PMFs of prglngth for two groups.

    'probability_nsfg_pmf': side-by-side bars (left) and thinkplot.Pmfs
    lines (right).
    'probability_nsfg_diffs': bar chart of 100 * (first - other)
    probability for weeks 35 through 45.

    firsts: DataFrame with a prglngth column
    others: DataFrame with a prglngth column
    """
    first_pmf = thinkstats2.Pmf(firsts.prglngth, label='first')
    other_pmf = thinkstats2.Pmf(others.prglngth, label='other')

    # left panel: bars on opposite sides of each week
    bar_width = 0.45
    thinkplot.PrePlot(2, cols=2)
    thinkplot.Hist(first_pmf, align='right', width=bar_width)
    thinkplot.Hist(other_pmf, align='left', width=bar_width)
    thinkplot.Config(xlabel='weeks',
                     ylabel='probability',
                     axis=[27, 46, 0, 0.6])

    # right panel: both PMFs drawn by thinkplot.Pmfs
    thinkplot.PrePlot(2)
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([first_pmf, other_pmf])
    thinkplot.Save(root='probability_nsfg_pmf',
                   xlabel='weeks',
                   axis=[27, 46, 0, 0.6])

    # differences between the PMFs, in percentage points
    weeks = range(35, 46)
    diffs = [100 * (first_pmf.Prob(week) - other_pmf.Prob(week))
             for week in weeks]

    thinkplot.Bar(weeks, diffs)
    thinkplot.Save(root='probability_nsfg_diffs',
                   title='Difference in PMFs',
                   xlabel='weeks',
                   ylabel='percentage points',
                   legend=False)
def PlotMarriageData():
    """Saves a two-row figure of the hazard and survival functions.

    Reads the respondents with chap01ex_soln.ReadFemResp, derives
    agemarry and age in years (the month-valued columns divided by 12),
    and estimates the hazard function from the agemarry of rows with
    evrmarry == 1 (complete) and the age of rows with evrmarry == 0
    (ongoing).  The figure is saved as 'survival2'.
    """
    resp = chap01ex_soln.ReadFemResp()

    # Replace the codes 9997-9999 with NaN.  The result is assigned back
    # to the column: calling replace(..., inplace=True) on resp.cmmarrhx
    # is a chained in-place update, which pandas warns about and which
    # leaves resp unchanged under Copy-on-Write.
    resp['cmmarrhx'] = resp.cmmarrhx.replace([9997, 9998, 9999], np.nan)

    resp['agemarry'] = (resp.cmmarrhx - resp.cmbirth) / 12.0
    resp['age'] = (resp.cmintvw - resp.cmbirth) / 12.0

    complete = resp[resp.evrmarry == 1].agemarry
    ongoing = resp[resp.evrmarry == 0].age

    hf = EstimateHazardFunction(complete, ongoing, label='hazard')
    sf = hf.MakeSurvival(label='survival')

    # top row: hazard function
    thinkplot.PrePlot(rows=2)
    thinkplot.Plot(hf)
    thinkplot.Config()

    # bottom row: survival function
    thinkplot.SubPlot(2)
    thinkplot.Plot(sf)
    thinkplot.Save(root='survival2',
                   xlabel='age (years)',
                   ylim=[0, 1])
def main():
    """Updates two pairs of normal PMFs, plots them, and compares them.

    p1/p2 and q1/q2 start as copies of the same
    thinkbayes2.MakeNormalPmf(0, 1, 3, n=101) distribution.  Four calls
    to Update are applied in sequence, then the p's and q's are shown in
    two rows and the results of p1 > p2 and q1 > q2 are printed.
    """
    def Prior(label):
        """Returns a fresh normal PMF carrying the given label."""
        pmf = thinkbayes2.MakeNormalPmf(0, 1, 3, n=101)
        pmf.label = label
        return pmf

    p1 = Prior('p1')
    p2 = p1.Copy(label='p2')
    q1 = Prior('q1')
    q2 = q1.Copy(label='q2')

    # The order matters: each call consumes the distributions returned by
    # the calls before it.
    p1, q1 = Update(p1, q1, True)
    p1, q2 = Update(p1, q2, True)
    p2, q1 = Update(p2, q1, True)
    p2, q2 = Update(p2, q2, False)

    # top row: p1 and p2
    thinkplot.PrePlot(num=4, rows=2)
    thinkplot.Pmfs([p1, p2])
    thinkplot.Config(legend=True)

    # bottom row: q1 and q2
    thinkplot.SubPlot(2)
    thinkplot.Pmfs([q1, q2])
    thinkplot.Show()

    print('Prob p1 > p2', p1 > p2)
    print('Prob q1 > q2', q1 > q2)
def compareDetroitAirport(flights):
    """Create PMFs to compare the Detroit airport versus other airports.

    For each of four arrival-delay windows (-100..100, -30..30, -60..0 and
    0..60 minutes) two figures are saved: a bar version
    ('<window>DetroitDelayBarPMF') and a thinkplot.Pmfs version
    ('<window>DetroitDelayStepPMF').

    Per JD Power: Detroit Metropolitan Wayne County Airport ranks highest
    in passenger satisfaction among mega airports with a score of 786.
    https://www.jdpower.com/business/press-releases/2019-north-america-airport-satisfaction-study

    flights: DataFrame with DESTINATION_AIRPORT and ARRIVAL_DELAY columns
    """
    # Both groups are split on the destination airport.  Comparing the
    # AIRLINE column with the airport code 'DTW' would not exclude the
    # Detroit flights from the comparison group.
    to_detroit = flights.DESTINATION_AIRPORT == 'DTW'
    detroit = flights[to_detroit]
    others = flights[~to_detroit]

    detroit_pmf = thinkstats2.Pmf(detroit.ARRIVAL_DELAY,
                                  label='Detroit Metro Arrival Delay')
    other_pmf = thinkstats2.Pmf(others.ARRIVAL_DELAY, label='other')

    width = 0.45
    windows = [(-100, 100), (-30, 30), (-60, 0), (0, 60)]

    for low, high in windows:
        tag = '%dto%d' % (low, high)
        span = '%d to %d' % (low, high)

        def SaveAs(kind):
            """Saves the current figure with this window's labels and axis."""
            thinkplot.Save(root='%sDetroitDelay%sPMF' % (tag, kind),
                           title='%s min Arrival Delay' % span,
                           xlabel='detroit metro arrival delay',
                           ylabel='probability %s mins' % span,
                           axis=[low, high, 0, 0.032])

        # bar version: bars on opposite sides of each delay value
        thinkplot.PrePlot(2, cols=2)
        thinkplot.Hist(detroit_pmf, align='right', width=width)
        thinkplot.Hist(other_pmf, align='left', width=width)
        SaveAs('Bar')

        # step version: both PMFs drawn by thinkplot.Pmfs
        thinkplot.PrePlot(2)
        thinkplot.SubPlot(2)
        thinkplot.Pmfs([detroit_pmf, other_pmf])
        SaveAs('Step')
def compareDay4(flights):
    """Create PMFs to compare rows with DAY == 4 against all other rows.

    For each of four arrival-delay windows (-100..100, -30..30, -60..0 and
    0..60 minutes) two figures are saved: a bar version
    ('Thursday<window>ArrivalDelayBarPMF') and a thinkplot.Pmfs version
    ('Thursday<window>ArrivalDelayStepPMF').

    Author's note: Day 4 (Thursday) was chosen because it showed the most
    flights for that day in the scatterplot.

    flights: DataFrame with DAY and ARRIVAL_DELAY columns
    """
    # NOTE(review): the file names and the note above treat DAY == 4 as
    # Thursday; confirm that DAY holds the day of the week and not the day
    # of the month.
    is_day4 = flights.DAY == 4
    day = flights[is_day4]
    others = flights[flights.DAY != 4]

    day_pmf = thinkstats2.Pmf(day.ARRIVAL_DELAY, label="Day 4 Arrival Delay")
    other_pmf = thinkstats2.Pmf(others.ARRIVAL_DELAY, label='other')

    bar_width = 0.45
    x_label = "day 4 arrival delay"

    for low, high in [(-100, 100), (-30, 30), (-60, 0), (0, 60)]:
        tag = '%dto%d' % (low, high)
        span = '%d to %d' % (low, high)

        # bar version: bars on opposite sides of each delay value
        thinkplot.PrePlot(2, cols=2)
        thinkplot.Hist(day_pmf, align='right', width=bar_width)
        thinkplot.Hist(other_pmf, align='left', width=bar_width)
        thinkplot.Save(root='Thursday%sArrivalDelayBarPMF' % tag,
                       title='%s min Arrival Delay' % span,
                       xlabel=x_label,
                       ylabel='probability %s mins' % span,
                       axis=[low, high, 0, 0.032])

        # step version: both PMFs drawn by thinkplot.Pmfs
        thinkplot.PrePlot(2)
        thinkplot.SubPlot(2)
        thinkplot.Pmfs([day_pmf, other_pmf])
        thinkplot.Save(root='Thursday%sArrivalDelayStepPMF' % tag,
                       title='%s min Arrival Delay' % span,
                       xlabel=x_label,
                       ylabel='probability %s mins' % span,
                       axis=[low, high, 0, 0.032])
def RunModel(self):
    """Shuffles self.fake_data in place and returns that same object."""
    shuffled = self.fake_data
    np.random.shuffle(shuffled)
    return shuffled


## main scripts

if __name__ == '__main__':

    ## read csv and group by quality and day
    transactions = pd.read_csv('mj-clean.csv', parse_dates=[5])
    dailies = GroupByQualityAndDay(transactions)

    ## plot time series by quality: one row per entry of dailies
    thinkplot.PrePlot(rows=3)
    for i, (name, daily) in enumerate(dailies.items()):
        thinkplot.SubPlot(i + 1)
        # only the first row gets a title
        title = 'Price per gram ($)' if i == 0 else ''
        thinkplot.Config(ylim=[0, 20], title=title)
        thinkplot.Scatter(daily.ppg, s=10, label=name)
        # only the third row keeps its x ticks, rotated 30 degrees
        if i != 2:
            thinkplot.Config(xticks=[])
        else:
            plt.xticks(rotation=30)
            thinkplot.Config()
    plt.show()

    ## calculate linear regressions for each quality
    for name, daily in dailies.items():
        model, results = RunLinearModel(daily)
## make HexBin plot
thinkplot.HexBin(heights, weights)
thinkplot.Show(xlabel='Height (cm)',
               ylabel='Weight (kg)',
               axis=[140, 210, 20, 200],
               legend=False)

## now use entire dataset
# Jitter the full htm3 / wtkg2 columns of df.  Previously the values were
# read from `sample` and then immediately overwritten with
# Jitter(heights, ...) / Jitter(weights, ...), so the plots below never
# used the entire dataset.
heights_all = Jitter(df.htm3, 1.4)
weights_all = Jitter(df.wtkg2, 0.5)

## make scatter plot
thinkplot.PrePlot(num=2, cols=2)
thinkplot.SubPlot(1)
thinkplot.Scatter(heights_all, weights_all, alpha=0.1, s=10)
# Config, not Show: showing here would end the figure before the second
# panel is drawn.
thinkplot.Config(xlabel='Height (cm)',
                 ylabel='Weight (kg)',
                 axis=[140, 210, 20, 200],
                 legend=False)

thinkplot.SubPlot(2)
thinkplot.HexBin(heights_all, weights_all)
thinkplot.Show(xlabel='Height (cm)',
               ylabel='Weight (kg)',
               axis=[140, 210, 20, 200],
               legend=False)

## bin data
cleaned = df.dropna(subset=['htm3', 'wtkg2'])