def PartitionRecords(table):
    """Split the live births of a pregnancy table by birth order.

    Records whose outcome code is not 1 (non-live births) are ignored.

    Args:
        table: pregnancy Table whose ``records`` are scanned

    Returns:
        pair (firsts, others) of new survey.Pregnancies tables: the live
        births with ``birthord == 1`` and all remaining live births
    """
    firsts = survey.Pregnancies()
    others = survey.Pregnancies()

    for rec in table.records:
        if rec.outcome == 1:
            # live birth: route it to the table matching its birth order
            dest = firsts if rec.birthord == 1 else others
            dest.AddRecord(rec)

    return firsts, others
def PoolRecords(*tables):
    """Build a single table holding the records of every given table.

    Args:
        tables: any number of tables, each exposing a ``records`` attribute

    Returns:
        new survey.Pregnancies table extended with each input's records,
        in argument order
    """
    pooled = survey.Pregnancies()
    for source in tables:
        pooled.ExtendRecords(source.records)
    return pooled
def main():
    """Print quartiles and the interquartile range of live-birth weights.

    Reads the pregnancy table, converts each live birth's pounds/ounces
    fields to total ounces, builds a Cdf of those weights and prints its
    25th, 50th and 75th percentiles plus the interquartile range.
    """
    # Exercise 3.11
    table = survey.Pregnancies()
    table.ReadRecords()

    live_birth_weights = []
    for p in table.records:
        if p.outcome != 1:
            continue
        lbs, oz = p.birthwgt_lb, p.birthwgt_oz
        # Non-integer entries (presumably missing-value markers) are skipped.
        if not (isinstance(lbs, int) and isinstance(oz, int)):
            continue
        total_oz = lbs * 16 + oz
        # Totals above 200 oz are dropped.
        if total_oz <= 200:
            live_birth_weights.append(total_oz)

    weights_cdf = Cdf.MakeCdfFromList(live_birth_weights,
                                      name="live birth weights")
    print('25th: %d 50th: %d 75th: %d interquartile range: %d' % (
        weights_cdf.Value(.25),
        median(weights_cdf),
        weights_cdf.Value(.75),
        interquartile(weights_cdf)))
def Summarize(data_dir): """Prints summary statistics for first babies and others. Returns: tuple of Tables """ table = survey.Pregnancies() table.ReadRecords(data_dir) # make a map from caseid to list of pairs d = {} for record in table.records: # skip non-live births if record.outcome != 1: continue # skip multiple births if record.nbrnaliv > 1: continue pair = record.birthord, record.prglength d.setdefault(record.caseid, []).append(pair) print len(d) # find all caseids with more than one live birth pmf = Pmf() for caseid, t in d.iteritems(): if len(t) <= 1: continue t.sort() _, prglength1 = t[0] _, prglength2 = t[1] if prglength1 < 15 or prglength2 < 15: continue diff = prglength2 - prglength1 if abs(diff) > 15: print caseid, prglength1, prglength2 pmf.Incr(diff) pmf.Normalize() return pmf
def PartitionBabies(data_dir='res'):
    """Collect (prglength, totalwgt_oz) pairs for live births by birth order.

    Args:
        data_dir: directory handed to ReadRecords; defaults to 'res', the
            location that used to be hard-coded

    Returns:
        (firsts, others, babies): three lists of (prglength, totalwgt_oz)
        tuples -- live births with ``birthord == 1``, the remaining live
        births, and all live births together
    """
    firsts, others, babies = [], [], []

    table = survey.Pregnancies()
    table.ReadRecords(data_dir)

    for baby in table.records:
        # only live births (outcome code 1) are considered
        if baby.outcome != 1:
            continue
        data = (baby.prglength, baby.totalwgt_oz)
        babies.append(data)
        if baby.birthord == 1:
            firsts.append(data)
        else:
            others.append(data)

    return firsts, others, babies
def main(name, data_dir='.'): table = survey.Pregnancies() table.ReadRecords(data_dir) print "Number of Pregnancies: ", len(table.records) live_births = 0 tot_births = 0 first_babies = [] other_bablies = [] for record in table.records: tot_births += 1 if record.outcome == 1: live_births += 1 if record.birthord == 1: first_babies.append(record) else: other_bablies.append(record) print "live_births: ", live_births, " out of: ", tot_births, " records." print "first_babies: ", len(first_babies) print "other_bablies: ", len(other_bablies) print thinkstats.Mean([1, 1, 1, 3, 3, 591])
def GetDurations(data_dir, keep_codes): """Reads pregnancy durations from NSFG data. data_dir: location of the data file """ preg = survey.Pregnancies() preg.ReadRecords(data_dir) print 'Number of pregnancies', len(preg.records) pmf = thinkstats2.Pmf() for record in preg.records: pmf.Incr(record.outcome) pmf.Print() durations = [ record.prglength for record in preg.records if record.outcome in keep_codes ] print 'Number of relevant pregnancies', len(durations) return durations
def main():
    """Plot a resampled live-birth-weight CDF and a random PMF/CDF.

    Exercise 3.9: builds a Cdf of live-birth weights (in ounces), draws
    1000 values from it with ``sample`` and plots the CDF of that sample.
    Exercise 3.10: draws 1000 values from ``random.random`` and plots their
    PMF and CDF.
    """
    # Exercise 3.9
    table = survey.Pregnancies()
    table.ReadRecords()

    live_birth_weights = []
    for p in table.records:
        if p.outcome != 1:
            continue
        lbs, oz = p.birthwgt_lb, p.birthwgt_oz
        # Non-integer entries (presumably missing-value markers) are skipped.
        if not (isinstance(lbs, int) and isinstance(oz, int)):
            continue
        total_oz = lbs * 16 + oz
        # Totals above 200 oz are dropped.
        if total_oz <= 200:
            live_birth_weights.append(total_oz)

    weights_cdf = Cdf.MakeCdfFromList(live_birth_weights,
                                      name="live birth weights")
    resampled_weights = sample(weights_cdf, 1000)
    myplot.Cdf(Cdf.MakeCdfFromList(resampled_weights))
    # NOTE(review): this call and the next use lowercase `show`, while the
    # final call uses `Show` -- confirm which name myplot actually exposes.
    myplot.show(title="CDF of live births resampled")

    # Exercise 3.10
    random_values = [random.random() for _ in range(1000)]
    myplot.Pmf(Pmf.MakePmfFromList(random_values))
    myplot.show(title="random pmf")
    myplot.Cdf(Cdf.MakeCdfFromList(random_values))
    myplot.Show(title="random cdf")
def main():
    """Print the standard deviation of pregnancy length for each group.

    The live births are split into first babies and others, the variance
    of ``prglength`` is computed for each group around its average, and the
    square roots are printed.
    """
    table = survey.Pregnancies()
    table.read_records()

    # Split the live births and get the mean gestation of each group.
    firsts, others = first.collect_live_births(table)
    firsts_average, others_average = first.averages(firsts, others)

    def length_variance(records, average):
        # variance of the pregnancy lengths around the supplied average
        lengths = [r.prglength for r in records]
        return thinkstats.variance(lengths, average)

    firsts_var = length_variance(firsts, firsts_average)
    others_var = length_variance(others, others_average)

    # Show standard deviations.
    firsts_std_line = 'Standard deviation for first gestations: {0}'.format(
        math.sqrt(firsts_var))
    print(firsts_std_line)
    others_std_line = 'Standard deviation for other gestations: {0}'.format(
        math.sqrt(others_var))
    print(others_std_line)
def testPregnancies(self):
    """Check the record count and selected value frequencies of the data.

    Reads the pregnancy table, then builds a histogram per field with
    MakeHist and compares a few frequencies against known values.  The
    last check compares the smallest and largest ``finalwgt`` values.
    """
    preg = survey.Pregnancies()
    preg.ReadRecords()
    self.assertEqual(len(preg.records), 13593)

    hist = MakeHist(preg, 'nbrnaliv')
    self.assertEqual(hist.Freq(1), 8981)

    hist = MakeHist(preg, 'babysex')
    self.assertEqual(hist.Freq(1), 4641)
    self.assertEqual(hist.Freq(2), 4500)

    hist = MakeHist(preg, 'outcome')
    self.assertEqual(hist.Freq(1), 9148)

    hist = MakeHist(preg, 'birthord')
    self.assertEqual(hist.Freq(1), 4413)

    hist = MakeHist(preg, 'birthwgt_lb')
    self.assertEqual(hist.Freq(6), 2223)

    hist = MakeHist(preg, 'birthwgt_oz')
    self.assertEqual(hist.Freq(6), 709)

    # 'NA' appears as a histogram value for these two fields
    hist = MakeHist(preg, 'agepreg')
    self.assertEqual(hist.Freq('NA'), 352)
    self.assertEqual(hist.Freq(25.0), 58)

    hist = MakeHist(preg, 'totalwgt_oz')
    self.assertEqual(hist.Freq('NA'), 4509)

    # finalwgt is checked through its extreme values
    hist = MakeHist(preg, 'finalwgt')
    t = hist.Values()
    low, high = min(t), max(t)
    self.assertAlmostEqual(low, 118.656789706)
    self.assertAlmostEqual(high, 261879.9538641)
def main():
    """Print the results of the pregnancy-data exercises.

    Reports the number of pregnancies, the live-birth count, the sizes of
    the first-baby and other-baby groups, and the difference between the
    two group averages expressed in days and in hours.
    """
    table = survey.Pregnancies()
    table.read_records()
    pregnancy_count = len(table.records)
    print('Number of pregnancies: {0}'.format(pregnancy_count))

    # 2nd exercise
    live = live_births(table)
    print('Live birhs: {0}'.format(live))

    # 3rd exercise
    firsts, others = collect_live_births(table)
    print('First babies: {0}'.format(len(firsts)))
    print('Other babies: {0}'.format(len(others)))

    # 4th exercise
    firsts_average, others_average = averages(firsts, others)
    # presumably the averages are in weeks, hence the factor of 7 -- confirm
    difference_days = (firsts_average - others_average) * 7.0
    difference_hours = difference_days * 24.0
    print('Difference in days: {0}'.format(difference_days))
    print('Difference in hours: {0}'.format(difference_hours))
import survey

# Load the pregnancy records with ReadRecords' default arguments.
table = survey.Pregnancies()
table.ReadRecords()

# Report the size of the table (relies on the table supporting len()).
print 'Num of pregnancies', len(table)
def main():
    """Print the first usable live-birth weight, in ounces.

    Reads the pregnancy table, converts each live birth's pounds/ounces
    fields to total ounces and prints the first resulting value.
    """
    table = survey.Pregnancies()
    table.ReadRecords()

    live_birth_weights = []
    for p in table.records:
        if p.outcome != 1:
            continue
        lbs, oz = p.birthwgt_lb, p.birthwgt_oz
        # Non-integer entries (presumably missing-value markers) are skipped.
        if not (isinstance(lbs, int) and isinstance(oz, int)):
            continue
        total_oz = lbs * 16 + oz
        # Totals above 200 oz are dropped.
        if total_oz <= 200:
            live_birth_weights.append(total_oz)

    print(live_birth_weights[0])
def MakeTables(data_dir='.'):
    """Load the survey data and partition it into first babies and others.

    Args:
        data_dir: directory handed to ReadRecords

    Returns:
        (table, firsts, others): the full pregnancy table followed by the
        two tables produced by PartitionRecords
    """
    pregnancies = survey.Pregnancies()
    pregnancies.ReadRecords(data_dir)

    first_babies, other_babies = PartitionRecords(pregnancies)
    return pregnancies, first_babies, other_babies
import survey
import Pmf
import matplotlib.pyplot as plt
import numpy as np

# Read every pregnancy record from the 'data' directory.
table = survey.Pregnancies()
table.readRecords(data_dir='data')

# Separate the live births (outcome 1) into first babies and the rest.
firsts_babies = survey.Pregnancies()
others_babies = survey.Pregnancies()
for r in table.records:
    if r.outcome == 1:
        (firsts_babies if r.birthord == 1 else others_babies).addRecord(r)

# Histogram the pregnancy lengths of each group.
firsts_prglengths = [r.prglength for r in firsts_babies.records]
firts_hist = Pmf.Hist(firsts_prglengths)
times1, values1 = firts_hist.render()

others_prglengths = [r.prglength for r in others_babies.records]
others_hist = Pmf.Hist(others_prglengths)
times2, values2 = others_hist.render()
times2 = np.array(times2)

# Draw the two series as bars, shifting the second by one bar width so
# the groups sit side by side.
plt.bar(times1, values1, width=0.45)
plt.bar(times2 + 0.45, values2, width=0.45)
import survey table = survey.Pregnancies() table.ReadRecords() firsts = survey.Pregnancies() others = survey.Pregnancies() for p in table.records: if p.outcome != 1: continue else: if p.birthord == 1: firsts.AddRecord(p) else: others.AddRecord(p) print 'Number of first babies', len(firsts.records) print 'Number of other babies', len(others.records)
def PoolRecords(*tables):
    '''
    Merge data tables.

    Returns a new survey.Pregnancies table extended with the records of
    each table passed in, in argument order.
    '''
    merged = survey.Pregnancies()
    for source in tables:
        merged.ExtendRecords(source.records)
    return merged
def ReadPregnancyRecords():
    """Load the survey data and hand back the populated table.

    Returns:
        survey.Pregnancies table after ReadRecords() has been called on it
        with its default arguments
    """
    pregnancies = survey.Pregnancies()
    pregnancies.ReadRecords()
    return pregnancies
#!/usr/bin/env python import survey import thinkstats TABLE = survey.Pregnancies() TABLE.ReadRecords() def ex1_3_2(): """Number of live births """ d = {'live': 0, 'death': 0} for r in TABLE.records: if r.outcome == 1: d['live'] += 1 elif r.outcome == 2: d['death'] += 1 return d def ex1_3_3(): """Number of live, first births vs. non-first """ d = {'first': 0, 'other': 0} for r in TABLE.records: if r.outcome == 1: if r.birthord == 1: d['first'] += 1
def makeTables(data_dir):
    """Read the pregnancy data and partition it with partitionRecords.

    Args:
        data_dir: directory handed to readRecords

    Returns:
        (table, firsts, others): the full table followed by the two values
        returned by partitionRecords
    """
    pregnancies = survey.Pregnancies()
    pregnancies.readRecords(data_dir)

    first_group, other_group = partitionRecords(pregnancies)
    return pregnancies, first_group, other_group
import survey
import thinkstats
import math

# Load the pregnancy records and report how many there are, followed by a
# blank line.
pregnancies = survey.Pregnancies()
pregnancies.ReadRecords()
print 'Number of pregnancies', len(pregnancies.records)
print

# Collect the pregnancy lengths of live births (outcome 1), separated into
# first children (birthord 1) and all others.
preg_lengths_first = []
preg_lengths_others = []
for preg in pregnancies.records:
    if preg.outcome != 1:
        continue
    if preg.birthord == 1:
        preg_lengths_first.append(preg.prglength)
    else:
        preg_lengths_others.append(preg.prglength)

# Count, mean, variance and standard deviation for first children.
pregs_first = len(preg_lengths_first)
mean_length_first, var_length_first = thinkstats.MeanVar(preg_lengths_first)
std_length_first = math.sqrt(var_length_first)

# The same statistics for the other children.
pregs_others = len(preg_lengths_others)
mean_length_others, var_length_others = thinkstats.MeanVar(preg_lengths_others)
std_length_others = math.sqrt(var_length_others)

# NOTE(review): only the first-child figures are printed in the visible
# part of the script; the remaining values are presumably reported further
# down -- confirm against the full file.
print 'Number of live births, first child', pregs_first
print 'Mean pregnancy length (weeks), first child', mean_length_first
print 'Variance of gestation time, first child', var_length_first