def main():
    """Print summary statistics of the interpolated log-income sample and plot its CDF.

    Reads the income table with ``hinc.ReadData``, builds a log-scale sample
    with ``InterpolateSample`` (``log_upper=6.0``), prints the median, both
    skewness measures, the mean and the fraction of the sample below the
    mean, then shows the CDF.
    """
    income_table = hinc.ReadData()
    log_sample = InterpolateSample(income_table, log_upper=6.0)
    log_cdf = thinkstats2.Cdf(log_sample)

    # Location and shape statistics of the log-scale sample.
    print("median", thinkstats2.Median(log_sample))
    print(
        "pearson's median skewness",
        thinkstats2.PearsonMedianSkewness(log_sample),
    )
    print("skewness", thinkstats2.Skewness(log_sample))
    print("mean", log_cdf.Mean())

    # Notes recorded from experimenting with different log_upper values.
    print(
        "the higher our log_upper, the more right-skewed (according to g_1) "
        "or at least less left-skewed (according to g_p) things get"
    )
    print("the mean moves to the right a bit, too.")

    print(
        "proportion of the population with income < mean",
        log_cdf.Prob(log_cdf.Mean()),
    )
    print(
        "the higher the upper bound, the greater the proprtion below the mean."
    )

    thinkplot.Cdf(log_cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')
def main():
    """Plot the CDF of the interpolated log-income sample.

    The sample is generated by ``InterpolateSample`` from the table returned
    by ``hinc.ReadData``, using ``log_upper=6.0``.
    """
    income_table = hinc.ReadData()
    log_cdf = thinkstats2.Cdf(
        InterpolateSample(income_table, log_upper=6.0)
    )

    thinkplot.Cdf(log_cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')
def main():
    """Plot the log-income CDF, then summarize and plot the income PDF.

    Steps:
      1. Build the log-scale sample (``log_upper=6.0``) and show its CDF.
      2. Raise 10 to the log values to get the income sample.
      3. Summarize the sample with ``density.Summarize`` and print the CDF
         value at the mean.
      4. Show an estimated PDF of the income sample.
    """
    income_table = hinc.ReadData()
    log_sample = InterpolateSample(income_table, log_upper=6.0)

    # First figure: CDF of the log-scale sample.
    thinkplot.Cdf(thinkstats2.Cdf(log_sample))
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # Undo the log transform; the sample values are base-10 exponents.
    sample = np.power(10, log_sample)

    # Only the mean is used below; the median is returned alongside it.
    mean, _median = density.Summarize(sample)
    income_cdf = thinkstats2.Cdf(sample)
    print('cdf[mean]', income_cdf[mean])

    # Second figure: estimated PDF of the income sample.
    income_pdf = thinkstats2.EstimatedPdf(sample)
    thinkplot.Pdf(income_pdf)
    thinkplot.Show(xlabel='household income', ylabel='PDF')
def main():
    """Plot the log-income CDF, print income statistics, and draw the PDF.

    Builds the log-scale sample with ``InterpolateSample`` (``log_upper=6.0``),
    shows its CDF, converts the sample back with ``10 ** x``, and prints the
    median, mean, both skewness measures and the CDF value at the mean.
    Finally an estimated PDF of the income sample is drawn.

    The output uses ``print()`` calls rather than Python 2 ``print``
    statements so that the function is valid under Python 3.
    """
    df = hinc.ReadData()
    log_sample = InterpolateSample(df, log_upper=6.0)

    log_cdf = thinkstats2.Cdf(log_sample)
    thinkplot.Cdf(log_cdf)
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # The sample values are base-10 exponents; undo the transform.
    sample = np.power(10, log_sample)
    mean = np.mean(sample)
    cdf = thinkstats2.Cdf(sample)

    print("Median:", np.median(sample))
    print("Mean:", mean)
    print("Skewness:", thinkstats2.Skewness(sample))
    print("Pearson's Skewness:", thinkstats2.PearsonMedianSkewness(sample))
    # cdf[mean] is the CDF evaluated at the mean.
    print("Percent of people with incomes <= mean:", cdf[mean])

    pdf = thinkstats2.EstimatedPdf(sample)
    thinkplot.Pdf(pdf)
import hinc
import hinc2
import thinkstats2
import thinkplot
import numpy as np

df = hinc.ReadData()


def describe_inc_dist(log_upper):
    """Print summary statistics of the income sample for one upper bound.

    Args:
        log_upper: value passed to ``hinc2.InterpolateSample`` as its
            ``log_upper`` argument.

    The interpolated sample is converted back with ``10 ** x`` and the
    mean, median, skewness, Pearson median skewness and the fraction of
    the sample below the mean are printed.
    """
    # Use the parameter itself, not the caller's global loop variable;
    # otherwise the function only works when invoked from a loop whose
    # variable happens to be named ``j``.
    log_sample = hinc2.InterpolateSample(df, log_upper=log_upper)
    incomes = np.power(10, log_sample)

    inc_mean = thinkstats2.Mean(incomes)
    inc_med = thinkstats2.Median(incomes)
    inc_skew = thinkstats2.Skewness(incomes)
    inc_pearskew = thinkstats2.PearsonMedianSkewness(incomes)

    print('log_upper = ', log_upper)
    print('Mean Income: ', inc_mean)
    print('Median Income: ', inc_med)
    print('Skewness: ', inc_skew)
    print('Pearson Median Skewness: ', inc_pearskew)

    cdf = thinkstats2.Cdf(incomes)
    inc_below_mean = cdf.Prob(inc_mean)
    print('Pct. below mean: ', inc_below_mean)
    print('\n')


for j in [6.0, 7.0, 8.0]:
    describe_inc_dist(log_upper=j)
def main():
    """Read the income table and pass it to ``MakeFigures``."""
    income_table = hinc.ReadData()
    MakeFigures(income_table)
df.loc[41, 'log_upper'] = log_upper # use the freq column to generate the right number of values in # each range arrays = [] for _, row in df.iterrows(): vals = np.linspace(row.log_lower, row.log_upper, row.freq) arrays.append(vals) # collect the arrays into a single sample log_sample = np.concatenate(arrays) return log_sample # get data import hinc income_df = hinc.ReadData() log_sample = InterpolateSample(income_df, log_upper=6.0) log_cdf = thinkstats2.Cdf(log_sample) thinkplot.Cdf(log_cdf) thinkplot.Config(xlabel='Household income (log $)', ylabel='CDF') # RESULTS: plot sample = np.power(10, log_sample) cdf = thinkstats2.Cdf(sample) thinkplot.Cdf(cdf) thinkplot.Config(xlabel='Household income ($)', ylabel='CDF')
def RunModel(self): """ Goal: Use resampling to simulate test data Output: Simulated data """ group1 = np.random.choice(self.pool, self.n, replace=True) group2 = np.random.choice(self.pool, self.m, replace=True) data = group1, group2 return data preg = nsfg.ReadFemPreg() live = preg[preg["outcome"] == 1] resp = nsfg.ReadFemResp() bs = brfss.ReadBrfss() income = hinc.ReadData() log_intp_income = hinc2.InterpolateSample(income, log_upper=6.0) # Q1. Think Stats Chapter 2 Exercise 4 (effect size of Cohen's d) first_wt = live.loc[preg["birthord"] == 1, "totalwgt_lb"] other_wt = live.loc[preg["birthord"] != 1, "totalwgt_lb"] ts.CohenEffectSize(first_wt, other_wt) # Q2. Think Stats Chapter 3 Exercise 1 (actual vs. biased) d = np.diff(np.unique(resp["numkdhh"])).min() left_of_first_bin = resp["numkdhh"].min() - float(d) / 2 right_of_last_bin = resp["numkdhh"].max() + float(d) / 2 plt.clf() plt.hist(resp["numkdhh"], bins=np.arange(left_of_first_bin, right_of_last_bin + d, d), histtype="step",