def main():
    """Plot the CDF and estimated PDF of an interpolated income sample.

    Loads the data with ``hinc.ReadData``, builds a sample with
    ``InterpolateSample`` (``log_upper=6.0``) and shows two figures: the
    CDF of that sample as returned, and the estimated PDF of the sample
    after each value has been used as a power of ten.  Between the two
    figures it prints the CDF evaluated at the mean reported by
    ``density.Summarize``.
    """
    income_table = hinc.ReadData()
    log_incomes = InterpolateSample(income_table, log_upper=6.0)

    # First figure: CDF of the sample exactly as InterpolateSample returns it
    # (presumably log10 of income, given the np.power(10, ...) below).
    thinkplot.Cdf(thinkstats2.Cdf(log_incomes))
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # Raise 10 to each value to obtain the sample that gets summarized.
    incomes = np.power(10, log_incomes)
    # Summarize returns a pair; only the first element (the mean) is used.
    mean_income, _median_income = density.Summarize(incomes)

    # Look up the CDF at the mean and print it.
    income_cdf = thinkstats2.Cdf(incomes)
    print('cdf[mean]', income_cdf[mean_income])

    # Second figure: estimated PDF of the exponentiated sample.
    thinkplot.Pdf(thinkstats2.EstimatedPdf(incomes))
    thinkplot.Show(xlabel='household income', ylabel='PDF')
# collect the arrays into a single sample log_sample = np.concatenate(arrays) return log_sample #%% # create a log_sample (using modified InterpolateSample) log_sample = InterpolateSample(df) #%% get the cdf and plot it log_cdf = thinkstats2.Cdf(log_sample) thinkplot.Cdf(log_cdf) # get a sample to calc mean, median sample = np.power(10, log_sample) mean, median = density.Summarize(sample) #print("The mean is: {}".format(mean)) #print("The median is: {}".format(median)) #%% # fraction of households below the mean cdf = thinkstats2.Cdf(sample) print('The fraction of households below the mean: {:.2f}'.format(cdf[mean])) #%% [markdown] # How do the results change with the upper bound? # <br> # The upper bound affects the skewness. As the upper bound increases, the moment-based skewness would also increase. The mean is only somewhat affected by an increase in the upper bound. Since the standard deviation is in the denominator, it will have a strong effect on the results.