def MakeDF(script, nrows=1000):
    """Reads BRFSS rows, prints summaries and spot-checks known counts.

    script: string script name, used only in the success message
    nrows: number of rows to read; coerced with int()

    returns: the object returned by ReadBrfss

    When exactly 1000 rows are requested, a handful of value counts are
    asserted against hard-coded expected numbers before the success
    message is printed.
    """
    thinkstats2.RandomSeed(17)

    nrows = int(nrows)
    frame = ReadBrfss(nrows=nrows)
    # MakeFigures(frame)  # disabled in this version

    summaries = (
        ('htm3', 'Height (cm):'),
        ('wtkg2', 'Weight (kg):'),
        ('wtyrago', 'Weight year ago (kg):'),
    )
    for column, label in summaries:
        Summarize(frame, column, label)

    # The expected counts below are only checked for the 1000-row read.
    if nrows != 1000:
        return frame

    expected_counts = (
        ('age', 40, 28),
        ('sex', 2, 668),
        ('wtkg2', 90.91, 49),
        ('wtyrago', 160 / 2.2, 49),
        ('htm3', 163, 103),
        ('finalwt', 185.870345, 13),
    )
    for column, value, count in expected_counts:
        assert getattr(frame, column).value_counts()[value] == count

    print('%s: All tests passed.' % script)
    return frame
def main():
    """Runs the estimation examples in sequence.

    Calls thinkstats2.RandomSeed(17) first (presumably so repeated runs
    produce the same output -- confirm against thinkstats2), then
    Estimate1, Estimate2, Estimate3 with m=1000, and SimulateSample.
    """
    thinkstats2.RandomSeed(17)

    Estimate1()
    Estimate2()
    Estimate3(m=1000)
    SimulateSample()
def MakeFigures():
    """Makes the BRFSS height/weight scatterplots and hexbin plot.

    Reads all BRFSS rows, takes a 5000-row sample (replace=False), and
    plots heights against weights three ways from the sample, then as a
    hexbin over every record.  After each GetHeightWeight call the value
    at position 100 is asserted against a hard-coded expectation.
    """
    thinkstats2.RandomSeed(17)

    all_rows = brfss.ReadBrfss(nrows=None)
    subset = SampleRows(all_rows, 5000, replace=False)

    # sample without jitter
    hts, wts = GetHeightWeight(subset)
    assert hts.values[100] == 175
    assert wts.values[100] == 86.36
    ScatterPlot('brfss_scatter1', hts, wts)

    # sample with jitter, plotted twice (second time with alpha=0.1)
    hts, wts = GetHeightWeight(subset, hjitter=1.5, wjitter=1.1)
    assert int(hts.values[100]) == 173
    assert int(wts.values[100]) == 85
    ScatterPlot('brfss_scatter2', hts, wts)
    ScatterPlot('brfss_scatter3', hts, wts, alpha=0.1)

    # hexbin of all records
    hts, wts = GetHeightWeight(all_rows, hjitter=1.3, wjitter=1.1)
    assert int(hts.values[100]) == 171
    assert int(wts.values[100]) == 55
    HexBin('brfss_scatter4', hts, wts)
def main():
    """Runs RunTests on samples of `live` at seven shrinking sizes.

    The first sample has len(live) rows; the requested size is halved
    (integer division) after every round.
    """
    thinkstats2.RandomSeed(18)

    live, _firsts, _others = first.MakeFrames()

    size = len(live)
    for _ in range(7):
        RunTests(thinkstats2.SampleRows(live, size))
        size //= 2
def main():
    """Builds the frames and runs the plotting and intervention steps.

    Calls thinkstats2.RandomSeed(17), obtains (live, firsts, others)
    from first.MakeFrames(), then calls PlotAdultWeights,
    PlotPregLengths and TestIntervention in that order.
    """
    thinkstats2.RandomSeed(17)

    live, firsts, others = first.MakeFrames()
    PlotAdultWeights(live)
    PlotPregLengths(live, firsts, others)
    TestIntervention()
def main(name, data_dir=''):
    """Runs the example and the figure-generating functions in order.

    name: not used in the body
    data_dir: not used in the body
    """
    thinkstats2.RandomSeed(17)

    MakeExample()

    # Every remaining step works on the frames from first.MakeFrames().
    live, firsts, others = first.MakeFrames()
    RandomFigure(live)
    TestSample(live)
    MakeCdf(live)
    MakeFigures(live, firsts, others)
def main(): thinkstats2.RandomSeed(17) # Estimate1() # Estimate2() # Estimate3(m=1000) # SimulateSample() # ex1() # ex2() ex3()
def main(script):
    """Reads the BRFSS data and passes the complete rows to Correlations.

    script: string script name; not used in the body

    Rows lacking either 'htm3' or 'wtkg2' are dropped before the call.
    """
    thinkstats2.RandomSeed(17)

    df = brfss.ReadBrfss(nrows=None)
    df = df.dropna(subset=['htm3', 'wtkg2'])
    Correlations(df)

    # NOTE: a bare `return` used to sit here, followed by MakeFigures(df)
    # and BinnedPercentiles(df), which therefore never ran.  The
    # unreachable calls were removed; add them back here to re-enable
    # the figures.
def main():
    """Estimates birth weight, then plots on the complete rows.

    EstimateBirthWeight receives the unfiltered `live` frame; the three
    plotting functions receive only rows where both 'agepreg' and
    'totalwgt_lb' are present.
    """
    thinkstats2.RandomSeed(17)

    all_live, _, _ = first.MakeFrames()
    EstimateBirthWeight(all_live)

    complete = all_live.dropna(subset=['agepreg', 'totalwgt_lb'])
    PlotSamplingDistributions(complete)
    PlotFit(complete)
    PlotResiduals(complete)
def main():
    """Runs the resample test, then RunTests on shrinking samples.

    RunResampleTest is called once with (firsts, others).  RunTests is
    then called seven times on samples of `live`, starting at len(live)
    rows and halving the requested size after each round.
    """
    # seed the random generators before any sampling is done
    thinkstats2.RandomSeed(23)

    live, firsts, others = first.MakeFrames()
    RunResampleTest(firsts, others)

    size = len(live)
    for _ in range(7):
        RunTests(thinkstats2.SampleRows(live, size))
        size //= 2
def main():
    """Runs Estimate4 after calling thinkstats2.RandomSeed(17)."""
    thinkstats2.RandomSeed(17)

    Estimate4()

    # NOTE: a bare `return` used to follow Estimate4(), leaving the rest
    # of the body unreachable: a loop printing SimulateSample(n=n) for n
    # in [10, 100, 1000], plus calls to Estimate1() and Estimate2().
    # That dead code was removed; add it back here to re-enable it.
def main(name, data_dir='.'):
    """Runs the logistic example and the regression models on `live`.

    name: not used in the body
    data_dir: not used in the body

    Adds a boolean 'isfirst' column (birthord == 1) to `live` before the
    model functions are called.
    """
    thinkstats2.RandomSeed(17)
    LogisticRegressionExample()

    live, _, _ = first.MakeFrames()
    live['isfirst'] = live.birthord == 1

    RunLogisticModels(live)
    RunSimpleRegression(live)
    RunModels(live)
    PredictBirthWeight(live)
def main(script):
    """Prints age/weight correlations and saves a scatter plot.

    script: string script name; not used in the body

    Works on rows of `live` where both 'agepreg' and 'totalwgt_lb' are
    present.
    """
    thinkstats2.RandomSeed(17)

    live, _, _ = first.MakeFrames()
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    BinnedPercentiles(complete)

    age_col = complete.agepreg
    weight_col = complete.totalwgt_lb

    reports = (
        ('thinkstats2 Corr', thinkstats2.Corr),
        ('thinkstats2 SpearmanCorr', thinkstats2.SpearmanCorr),
    )
    for label, corr_func in reports:
        print(label, corr_func(age_col, weight_col))

    ScatterPlot(age_col, weight_col, alpha=0.1)
    thinkplot.Save(root='chap07scatter1',
                   legend=False,
                   formats=['jpg'])
def main(name, nrows=None):
    """Tests the height/weight correlation in the BRFSS data.

    name: string script name; not used in the body
    nrows: number of rows to read, or None; a non-None value is coerced
           with int()

    ComputeCorrelations is only called when nrows is None.
    """
    thinkstats2.RandomSeed(17)

    if nrows is not None:
        nrows = int(nrows)

    df = brfss.ReadBrfss(nrows=nrows)
    columns = df[['htm3', 'wtkg2']].dropna()
    heights, weights = columns.htm3.values, columns.wtkg2.values

    TestCorrelation(heights, weights)
    # Identity comparison is the idiomatic (and override-proof) None test.
    if nrows is None:
        ComputeCorrelations(heights, weights)
def main():
    """Runs the hypothesis tests in sequence, printing each result.

    Order: coin test, RunTests on pregnancy lengths, permutation test on
    birth weights, correlation test on age vs. weight, dice test,
    chi-squared test on pregnancy lengths, false negative rate, and
    finally ReplicateTests.
    """
    thinkstats2.RandomSeed(17)

    # run the coin test
    coin_test = CoinTest((140, 110))
    print('coin test p-value', coin_test.PValue())

    # compare pregnancy lengths
    print('\nprglngth')
    live, firsts, others = first.MakeFrames()
    RunTests((firsts.prglngth.values, others.prglngth.values))

    # compare birth weights
    print('\nbirth weight')
    weight_pair = (firsts.totalwgt_lb.dropna().values,
                   others.totalwgt_lb.dropna().values)
    weight_test = DiffMeansPermute(weight_pair)
    weight_p = weight_test.PValue(iters=1000)
    print('means permute two-sided')
    PrintTest(weight_p, weight_test)

    # test correlation, using rows with both columns present
    complete = live.dropna(subset=['agepreg', 'totalwgt_lb'])
    corr_test = CorrelationPermute(
        (complete.agepreg.values, complete.totalwgt_lb.values))
    corr_p = corr_test.PValue()
    print('\nage weight correlation')
    print('n=', len(complete))
    PrintTest(corr_p, corr_test)

    # run the dice test
    RunDiceTest()

    # compare pregnancy lengths (chi-squared)
    chi_test = PregLengthTest(
        (firsts.prglngth.values, others.prglngth.values))
    chi_p = chi_test.PValue()
    print('\npregnancy length chi-squared')
    PrintTest(chi_p, chi_test)

    # compute the false negative rate for difference in pregnancy length
    neg_rate = FalseNegRate(
        (firsts.prglngth.values, others.prglngth.values))
    print('false neg rate', neg_rate)

    # run the tests with new nsfg data
    ReplicateTests()
def main():
    """Makes the CLT plots, prints the gorilla example, runs the tests.

    The gorilla example builds Normal(90, 7.5**2), divides the result of
    Sum(9) by 9, and prints that distribution's sigma and its 5th and
    95th percentiles.
    """
    thinkstats2.RandomSeed(17)
    MakeCltPlots()

    print('Gorilla example')
    base_dist = Normal(90, 7.5**2)
    print(base_dist)

    mean_dist = base_dist.Sum(9) / 9
    print(mean_dist.sigma)
    low, high = mean_dist.Percentile(5), mean_dist.Percentile(95)
    print(low, high)

    live, firsts, others = first.MakeFrames()
    TestCorrelation(live)
    PlotPregLengths(live, firsts, others)
    TestChiSquared()
def main():
    """Prints summary stats and runs three tests on pregnancy length.

    For each configured test: build it from (firsts, others) pregnancy
    lengths, print its p-value from 1000 iterations, plot its CDF and
    save the figure under the given root.
    """
    thinkstats2.RandomSeed(17)

    # get the data
    live, firsts, others = first.MakeFrames()
    print('(Mean, Var) of prglength for live births',
          thinkstats2.MeanVar(live.prglngth))

    data = firsts.prglngth.values, others.prglngth.values

    # (test class, output root, plot title, x-axis label)
    runs = (
        (DiffMeansPermute, 'hypothesis1', 'Permutation test',
         'difference in means (weeks)'),
        (DiffStdPermute, 'hypothesis2', 'Permutation test',
         'difference in std (weeks)'),
        # NOTE(review): this entry is titled as a resampling test of the
        # difference in means, yet it uses DiffStdPermute -- confirm
        # which test class was intended.
        (DiffStdPermute, 'hypothesis3', 'Resampling test',
         'difference in means (weeks)'),
    )
    for test_class, root, title, xlabel in runs:
        test = test_class(data)
        print('p-value =', test.PValue(iters=1000))
        test.PlotCdf()
        thinkplot.Save(root=root,
                       title=title,
                       xlabel=xlabel,
                       ylabel='CDF',
                       legend=False)
def main(name):
    """Runs the time-series plots and models on the daily price data.

    name: not read by the body (the original rebinds it before use)

    Produces the 'timeseries4' figure for the 'high' series and the
    'timeseries5' figure for the 'medium' series.
    """
    thinkstats2.RandomSeed(18)

    dailies = GroupByQualityAndDay(ReadData())
    PlotDailies(dailies)
    RunModels(dailies)
    PrintSerialCorrelations(dailies)
    MakeAcfPlot(dailies)

    def save_predictions(root):
        # The x-limits are padded by 0.1 on each side of the years grid.
        xlim = years[0] - 0.1, years[-1] + 0.1
        thinkplot.Save(root=root,
                       title='predictions',
                       xlabel='years',
                       xlim=xlim,
                       ylabel='price per gram ($)',
                       formats=FORMATS)

    quality = 'high'
    series = dailies[quality]
    PlotLinearModel(series, quality)
    PlotRollingMean(series, quality)
    PlotFilled(series, quality)

    years = np.linspace(0, 5, 101)
    thinkplot.Scatter(series.years, series.ppg, alpha=0.1, label=quality)
    PlotPredictions(series, years)
    save_predictions('timeseries4')

    quality = 'medium'
    series = dailies[quality]
    thinkplot.Scatter(series.years, series.ppg, alpha=0.1, label=quality)
    PlotIntervals(series, years)
    PlotPredictions(series, years)
    save_predictions('timeseries5')
def main():
    """Makes the analytic-distribution figures and the normality plots.

    The normal model uses every non-null 'totalwgt_lb' value; the normal
    plot also receives the weights from rows with prglngth >= 37.
    """
    thinkstats2.RandomSeed(18)
    MakeExampleNormalPlot()

    # make the analytic CDFs
    MakeExpoCdf()
    MakeBabyBoom()
    MakeParetoCdf()
    MakeParetoCdf2()
    MakeNormalCdf()

    # test the distribution of birth weights for normality
    pregnancies = nsfg.ReadFemPreg()
    all_weights = pregnancies.totalwgt_lb.dropna()
    long_enough = pregnancies[pregnancies.prglngth >= 37]
    full_term_weights = long_enough.totalwgt_lb.dropna()

    MakeNormalModel(all_weights)
    MakeNormalPlot(all_weights, full_term_weights)
def main():
    """Makes the survival-analysis figures.

    Plots pregnancy and marriage data, resamples survival for Cycle 6,
    then plots resampled survival by decade for Cycles 5-7 twice:
    without predictions ('survival4') and with predictions
    ('survival5').
    """
    thinkstats2.RandomSeed(17)

    def save_survival(root):
        # Both by-decade figures share the same axes and formats.
        thinkplot.Save(root=root,
                       xlabel='age (years)',
                       ylabel='prob unmarried',
                       xlim=[13, 45],
                       ylim=[0, 1],
                       formats=FORMATS)

    pregnancy_sf = PlotPregnancyData(nsfg.ReadFemPreg())

    # make the plots based on Cycle 6
    cycle6 = ReadFemResp2002()
    marriage_sf = PlotMarriageData(cycle6)
    ResampleSurvival(cycle6)
    PlotRemainingLifetime(pregnancy_sf, marriage_sf)

    # read Cycles 5 and 7
    cycle5 = ReadFemResp1995()
    cycle7 = ReadFemResp2010()
    cycles = [cycle5, cycle6, cycle7]

    # plot resampled survival functions by decade
    PlotResampledByDecade(cycles)
    save_survival('survival4')

    # plot resampled survival functions by decade, with predictions
    PlotResampledByDecade(cycles, predict_flag=True, omit=[5])
    save_survival('survival5')
def main():
    """Plots resampled survival functions by decade for Cycle 6.

    Reads the 2002 respondents, resamples survival by decade with and
    without predict_flag, plots the non-predicted map, and saves the
    figure as 'marriage1' in pdf format.
    """
    thinkstats2.RandomSeed(17)

    # make the plots based on Cycle 6
    resp6 = ReadFemResp2002()
    resps = [resp6]
    sf_map = ResampleSurvivalByDecade(resps)

    # The predicted map is not used below.  The call is kept because it
    # may consume random numbers or have other side effects -- TODO
    # confirm whether it can be dropped.
    ResampleSurvivalByDecade(resps, predict_flag=True)

    PlotSurvivalFunctions(sf_map)
    thinkplot.Save(root='marriage1', formats=['pdf'])

    # NOTE: a bare `return` used to follow the Save call, leaving a chain
    # of ReadFemResp*/Validate* calls for the 2013, 2010, 2002, 1995,
    # 1988 and 1982 data unreachable.  That dead code was removed.
def main():
    """Calls thinkstats2.RandomSeed(17), MakePdfExample, ComputeSkewnesses."""
    thinkstats2.RandomSeed(17)

    MakePdfExample()
    ComputeSkewnesses()
def main():
    """Runs the flight-delay analysis steps in order.

    Reads the flight, airline and airport data, then produces the
    histograms, PMF/CDF comparisons, normal model, scatter plots,
    correlations, hypothesis test and regression fits.  The block
    comments quote the assignment prompt that each step addresses.
    """
    thinkstats2.RandomSeed(17)

    flights = ReadFlightData()
    airlines = ReadAirlineData()
    airports = ReadAirportData()

    # A minimum of 5 variables in your dataset used during your analysis
    # (for help with selecting, the author made his selection on page 6
    # of your book). Consider what you think could have an impact on
    # your question - remember this is never perfect, so don't be
    # worried if you miss one (Chapter 1).
    #
    # Describe what the 5 variables mean in the dataset (Chapter 1).
    #
    # DAY_OF_WEEK - Integer 1 - 7 corresponding to the day of the week.
    #     1 is Monday and 7 is Sunday.
    # AIRLINE - Letter code corresponding to the airline for the flight.
    # ORIGIN_AIRPORT - Airport code corresponding to the flight's origin
    #     airport.
    # DESTINATION_AIRPORT - Airport code corresponding to the flight's
    #     destination airport.
    # DEPARTURE_DELAY - Integer value corresponding to the departure
    #     delay for the flight. Computed from SCHEDULED_DEPARTURE and
    #     DEPARTURE_TIME.
    # ARRIVAL_DELAY - Integer value corresponding to the arrival delay
    #     for the flight. Computed from SCHEDULED_ARRIVAL and
    #     ARRIVAL_TIME.

    # Include a histogram of each of the 5 variables - in your summary
    # and analysis, identify any outliers and explain the reasoning for
    # them being outliers and how you believe they should be handled
    # (Chapter 2). Include the other descriptive characteristics about
    # the variables: Mean, Mode, Spread, and Tails (Chapter 2).
    createHistograms(flights, airlines, airports)

    alaska = flights[flights.AIRLINE == 'AS']
    not_alaska = flights[flights.AIRLINE != 'AS']

    # Using pg. 29 of your text as an example, compare two scenarios in
    # your data using a PMF. Reminder, this isn't comparing two
    # variables against each other - it is the same variable, but a
    # different scenario. Almost like a filter.
    # The example in the book is first babies compared to all other
    # babies, it is still the same variable, but breaking the data out
    # based on criteria we are exploring (Chapter 3).
    compareAlaskaAirlinesPmf(alaska, not_alaska)
    compareDetroitAirport(flights)
    compareDay4(flights)

    # Create 1 CDF with one of your variables, using page 41-44 as your
    # guide, what does this tell you about your variable and how does it
    # address the question you are trying to answer (Chapter 4).
    compareAlaskaAirlinesCdf(alaska, not_alaska)

    arrival_delays = flights.ARRIVAL_DELAY.dropna()

    # Plot 1 analytical distribution and provide your analysis on how it
    # applies to the dataset you have chosen (Chapter 5).
    MakeNormalModel(arrival_delays)
    MakeNormalPlot(arrival_delays)

    # Create two scatter plots comparing two variables and provide your
    # analysis on correlation and causation. Remember, covariance,
    # Pearson's correlation, and Non-Linear Relationships should also be
    # considered during your analysis (Chapter 7).
    MakeAirlineArrivalDelayScatterPlots(flights)
    MakeArrivalDepartureDelayScatterPlots(flights)
    ComputeArrivalDepartureDelayCorrelations(flights)
    ComputeAirlineArrivalDelayCorrelations(flights)

    # Remove data with missing arrival delay.
    # It seems most of the rows in the set with a missing arrival delay
    # are also missing values for other attributes.
    # I do not feel this will have an impact for this analysis.

    # Conduct a test on your hypothesis using one of the methods covered
    # in Chapter 9.
    hypothesis_test_data = (alaska.ARRIVAL_DELAY.dropna().values,
                            not_alaska.ARRIVAL_DELAY.dropna().values)
    RunAlaskaTests(hypothesis_test_data)

    # For this project, conduct a regression analysis on either one
    # dependent and one explanatory variable, or multiple explanatory
    # variables (Chapter 10 & 11).
    PlotAirlineArrivalDelayFit(flights)
    PlotArrivalDepartureDelayFit(flights)
live2 = live.dropna(subset=['agepreg', 'totalwgt_lb']) data = live2.agepreg.values, live2.totalwgt_lb.values ht = hypothesis.CorrelationPermute(data) p3 = ht.PValue(iters=iters) # compare pregnancy lengths (chi-squared) data = firsts.prglngth.values, others.prglngth.values ht = hypothesis.PregLengthTest(data) p4 = ht.PValue(iters=iters) print("{}\t{:.3f}\t{:.3f}\t{:.3f}\t{:.3f}".format(n, p1, p2, p3, p4)) #%% # set the random generators thinkstats2.RandomSeed(18) # get the wght and length live, firsts, others = first.MakeFrames() RunSampleTest(first, others) #%% # run the test n = len(live) print("nval\t Test1\t Test2\t Test3\t Test4\t") for i in range(7): sample = thinkstats2.SampleRows(live, n) RunTests(sample) n //= 2 #%% [markdown]
def main():
    """Calls thinkstats2.RandomSeed(17) and then MakeFigures()."""
    thinkstats2.RandomSeed(17)
    MakeFigures()