Example #1
0
def main():
    """Plot the CDF and estimated PDF of an interpolated income sample.

    The sample returned by InterpolateSample is treated as base-10 logs:
    its CDF is plotted as-is, then the values are mapped back with
    ``10 ** x`` before being summarized and used for the PDF estimate.
    The CDF of the back-transformed sample, indexed at the mean, is
    printed to stdout.
    """
    # Build the log-space sample from whatever hinc.ReadData() returns.
    log_incomes = InterpolateSample(hinc.ReadData(), log_upper=6.0)

    # First figure: CDF of the log-space values.
    thinkplot.Cdf(thinkstats2.Cdf(log_incomes))
    thinkplot.Show(xlabel='household income', ylabel='CDF')

    # Undo the log transform before summarizing.
    incomes = np.power(10, log_incomes)
    # Summarize yields a pair; only its first element is used below.
    mean, _median = density.Summarize(incomes)

    # Index the CDF at the mean and report the result.
    income_cdf = thinkstats2.Cdf(incomes)
    print('cdf[mean]', income_cdf[mean])

    # Second figure: estimated PDF of the back-transformed sample.
    thinkplot.Pdf(thinkstats2.EstimatedPdf(incomes))
    thinkplot.Show(xlabel='household income', ylabel='PDF')
    # collect the arrays into a single sample
    log_sample = np.concatenate(arrays)
    return log_sample

#%%
# Build the log-space sample with the modified InterpolateSample.
# Only `df` is passed, so every other parameter takes its default.
# NOTE(review): `df` must already be defined by an earlier cell - confirm.
log_sample = InterpolateSample(df)

#%% get the cdf and plot it
# CDF of the log-space values.
log_cdf = thinkstats2.Cdf(log_sample)
thinkplot.Cdf(log_cdf)

# Map the values back out of log space (10 ** x) so the mean and median
# are computed on the back-transformed sample.
sample = np.power(10, log_sample)

# Summarize returns a pair that is unpacked as (mean, median).
mean, median = density.Summarize(sample)

#print("The mean is: {}".format(mean))
#print("The median is: {}".format(median))

#%%
# Fraction of households below the mean: the CDF of the back-transformed
# sample indexed at the mean, printed with two decimal places.
cdf = thinkstats2.Cdf(sample)
print('The fraction of households below the mean: {:.2f}'.format(cdf[mean]))

#%% [markdown]
# How do the results change with the upper bound?
# <br>
# The upper bound affects the skewness. As the upper bound increases, the moment-based skewness also increases. The mean is only somewhat affected by an increase in the upper bound. Because the standard deviation is in the denominator of the skewness formula, it has a strong effect on the results.