Example #1
0
 def Render(self):
     """Generates the points of this distribution's normal CDF.

     The curve runs from three standard deviations below the mean
     (self.mu) to three standard deviations (self.sigma) above it.

     returns: pair of xs, ys suitable for plotting
     """
     center, spread = self.mu, self.sigma
     reach = 3 * spread
     xs, ys = thinkstats2.RenderNormalCdf(
         center, spread, center - reach, center + reach)
     return xs, ys
Example #2
0
def MakeNormalModel(arrivalDelays, low=0, high=12.5):
    """Plot the CDF of arrival delays with a normal model.

    This is a modified copy from analytic.py.

    arrivalDelays: sequence of arrival delays (the x-axis is labeled in minutes)
    low: lower end of the x range over which the model CDF is rendered
    high: upper end of the x range over which the model CDF is rendered

    The defaults for low and high are the values that used to be
    hard-coded here, so existing callers get the same figure. Pass a
    range that spans the data to draw the model over the whole CDF.

    Prints the estimated parameters and saves the figure under the root
    'NormalModel_arrivaldelay_model'.
    """

    # estimate parameters: trimming outliers yields a better fit
    mu, var = thinkstats2.TrimmedMeanVar(arrivalDelays, p=0.01)
    print('Mean, Var', mu, var)

    # plot the model
    sigma = math.sqrt(var)
    print('Sigma', sigma)
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=low, high=high)

    thinkplot.Plot(xs, ps, label='model', color='0.8')

    # plot the data
    cdf = thinkstats2.Cdf(arrivalDelays, label='data')

    thinkplot.PrePlot(1)
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='NormalModel_arrivaldelay_model',
                   title='Arrival Delays',
                   xlabel='arrival delays (min)',
                   ylabel='CDF')
Example #3
0
def PlotNormalModel(sample, title="", xlabel=""):
    """Plots the CDF of a sample together with a fitted normal model.

    sample: sequence of values
    title: string title passed to thinkplot.Config
    xlabel: string x-axis label passed to thinkplot.Config
    """
    data_cdf = thinkstats2.Cdf(sample, label="actual")

    # model parameters come from the trimmed mean and variance (p=0.01)
    mu, var = thinkstats2.TrimmedMeanVar(sample, p=0.01)
    sigma = np.sqrt(var)

    # render the model four standard deviations either side of the mean
    half_width = 4.0 * sigma
    xs, ys = thinkstats2.RenderNormalCdf(mu, sigma,
                                         mu - half_width,
                                         mu + half_width)

    thinkplot.Cdf(data_cdf)
    model_label = r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma)
    thinkplot.plot(xs, ys, label=model_label)
    thinkplot.Config(title=title, xlabel=xlabel, ylabel="CDF")
Example #4
0
def MakeNormalModel(data, label):
    """Plots the CDF of data along with a normal model.

    The model is rendered over four standard deviations on each side
    of the mean and is plotted before the data.

    data: sequence
    label: string label for the data CDF
    """
    data_cdf = thinkstats2.Cdf(data, label=label)

    mu, variance = thinkstats2.TrimmedMeanVar(data)
    sigma = np.sqrt(variance)
    print('n, mean, std', len(data), mu, sigma)

    span = 4 * sigma
    model_xs, model_ps = thinkstats2.RenderNormalCdf(mu, sigma,
                                                     mu - span, mu + span)

    thinkplot.Plot(model_xs, model_ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(data_cdf)
def MakeFigures():
    """Plots the CDF of populations in several forms.

    On a log-log scale the tail of the CCDF looks like a straight line,
    which suggests a Pareto distribution, but that turns out to be misleading.

    On a log-x scale the distribution has the characteristic sigmoid of
    a lognormal distribution.

    The normal probability plot of log(sizes) confirms that the data fit the
    lognormal model very well.

    Many phenomena that have been described with Pareto models can be described
    as well, or better, with lognormal models.

    Reads the populations with ReadData(), prints how many there are, and
    saves two figures under the roots 'populations_pareto' and
    'populations_normal'.
    """
    pops = ReadData()
    print('Number of cities/towns', len(pops))

    # every plot below is drawn against log10 population, so only the CDF
    # of the log-transformed values is needed
    log_pops = np.log10(pops)
    cdf_log = thinkstats2.Cdf(log_pops, label='data')

    # pareto plot
    # the model is rendered over raw populations, so take log10 of its xs
    # to put it on the same axis as the data; 1-ys turns the CDF into a CCDF
    xs, ys = thinkstats2.RenderParetoCdf(xmin=5000, alpha=1.4, low=0, high=1e7)
    thinkplot.Plot(np.log10(xs), 1-ys, label='model', color='0.8')

    thinkplot.Cdf(cdf_log, complement=True)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CCDF',
                     yscale='log')
    thinkplot.Save(root='populations_pareto')

    # lognormal plot
    thinkplot.PrePlot(cols=2)

    # a normal model of log10(population) is a lognormal model of population
    mu, sigma = log_pops.mean(), log_pops.std()
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=0, high=8)
    thinkplot.Plot(xs, ps, label='model', color='0.8')

    thinkplot.Cdf(cdf_log)
    thinkplot.Config(xlabel='log10 population',
                     ylabel='CDF')

    # second panel: normal probability plot of the log populations
    thinkplot.SubPlot(2)
    thinkstats2.NormalProbabilityPlot(log_pops, label='data')
    thinkplot.Config(xlabel='z',
                     ylabel='log10 population',
                     xlim=[-5, 5])

    thinkplot.Save(root='populations_normal')
Example #6
0
def MakeNormalModel(weights):
    """Plots the CDF of weights together with a normal model.

    The model parameters come from thinkstats2.TrimmedMeanVar; the model
    is rendered over four standard deviations on each side of the mean
    and is plotted before the data.

    weights: sequence
    """
    data_cdf = thinkstats2.Cdf(weights, label='weights')

    mu, variance = thinkstats2.TrimmedMeanVar(weights)
    sigma = math.sqrt(variance)
    print('n, mean, std', len(weights), mu, sigma)

    low, high = mu - 4 * sigma, mu + 4 * sigma
    model_xs, model_ps = thinkstats2.RenderNormalCdf(mu, sigma, low, high)

    thinkplot.Plot(model_xs, model_ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(data_cdf)
Example #7
0
def MakeNormalModel(age):
    """Plots the CDF of age (labeled 'variable') with a normal model.

    The model uses the mean and variance returned by
    thinkstats2.TrimmedMeanVar and spans four standard deviations on
    each side of the mean.

    age: sequence
    """
    age_cdf = thinkstats2.Cdf(age, label='variable')

    center, variance = thinkstats2.TrimmedMeanVar(age)
    spread = np.sqrt(variance)
    print('n, mean, std', len(age), center, spread)

    # x range of the model curve
    lower = center - 4 * spread
    upper = center + 4 * spread
    curve_xs, curve_ps = thinkstats2.RenderNormalCdf(center, spread,
                                                     lower, upper)

    # model first, then the data CDF
    thinkplot.Plot(curve_xs, curve_ps, label='model', linewidth=4, color='0.8')
    thinkplot.Cdf(age_cdf)
Example #8
0
def MakeNormalCdf():
    """Plots normal CDFs for three parameter pairs and saves the figure.

    Each curve is rendered over the x range -1.0 to 4.0; the figure is
    saved under the root 'analytic_normal_cdf'.
    """
    thinkplot.PrePlot(3)

    # (mu, sigma) pairs, one curve per pair
    params = [(1.0, 0.5), (2.0, 0.4), (3.0, 0.3)]
    for mu, sigma in params:
        xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=-1.0, high=4.0)
        thinkplot.Plot(xs, ps, label=r'$\mu=%g$, $\sigma=%g$' % (mu, sigma))

    save_options = dict(root='analytic_normal_cdf',
                        title='Normal CDF',
                        xlabel='x',
                        ylabel='CDF',
                        loc=2)
    thinkplot.Save(**save_options)
Example #9
0
def MakeFigures(df):
    """Plots the CDF of income in several forms.

    df: object with `income` and `ps` attributes, each exposing `.values`
        (presumably a pandas DataFrame -- confirm against the caller)

    Saves three figures, under the roots 'hinc_linear', 'hinc_pareto'
    and 'hinc_normal', and prints the median and std used for the
    lognormal model.
    """
    xs, ps = df.income.values, df.ps.values
    # one CDF over raw income and one over log10(income), sharing the same ps
    cdf = SmoothCdf(xs, ps, label='data')
    cdf_log = SmoothCdf(np.log10(xs), ps, label='data')

    # linear plot
    thinkplot.Cdf(cdf)
    thinkplot.Save(root='hinc_linear', xlabel='household income', ylabel='CDF')

    # pareto plot
    # for the model I chose parameters by hand to fit the tail
    xs, ys = thinkstats2.RenderParetoCdf(xmin=55000,
                                         alpha=2.5,
                                         low=0,
                                         high=250000)
    # 1 - ys turns the model CDF into a CCDF to match complement=True below
    thinkplot.Plot(xs, 1 - ys, label='model', color='0.8')

    # NOTE(review): x values here are raw incomes drawn on a log-scaled axis
    # (xscale='log'), while the label says 'log10 household income' -- confirm
    # the label is what is intended.
    thinkplot.Cdf(cdf, complement=True)
    thinkplot.Save(root='hinc_pareto',
                   xlabel='log10 household income',
                   ylabel='CCDF',
                   xscale='log',
                   yscale='log')

    # lognormal plot
    # for the model I estimate mu and sigma using
    # percentile-based statistics
    median = cdf_log.Percentile(50)
    iqr = cdf_log.Percentile(75) - cdf_log.Percentile(25)
    std = iqr / 1.349

    # choose std to match the upper tail
    # (this overrides the IQR-based estimate computed just above, so only
    # the median comes from the percentiles)
    std = 0.35
    print(median, std)

    xs, ps = thinkstats2.RenderNormalCdf(median, std, low=3.5, high=5.5)
    thinkplot.Plot(xs, ps, label='model', color='0.8')

    thinkplot.Cdf(cdf_log)
    thinkplot.Save(root='hinc_normal',
                   xlabel='log10 household income',
                   ylabel='CDF')
def MakeNormalModel(weights):
    """Plots the CDF of birth weights against a normal model and saves it.

    Prints the estimated mean, variance and sigma, then saves the figure
    under the root 'analytic_birthwgt_model'.

    weights: sequence of birth weights (the x-axis is labeled in lbs)
    """
    # fit the model; trimming outliers yields a better fit
    mean, variance = thinkstats2.TrimmedMeanVar(weights, p=0.01)
    print('Mean, Var', mean, variance)

    std = math.sqrt(variance)
    print('Sigma', std)

    # model curve over the fixed range 0 to 12.5
    model_xs, model_ps = thinkstats2.RenderNormalCdf(mean, std,
                                                     low=0, high=12.5)
    thinkplot.Plot(model_xs, model_ps, label='model', color='0.8')

    # data curve
    data_cdf = thinkstats2.Cdf(weights, label='data')
    thinkplot.PrePlot(1)
    thinkplot.Cdf(data_cdf)

    save_options = dict(root='analytic_birthwgt_model',
                        title='Birth weights',
                        xlabel='birth weight (lbs)',
                        ylabel='CDF')
    thinkplot.Save(**save_options)
Example #11
0
# CDF of the O3 measurements
thinkplot.Cdf(cdf)
thinkplot.Show(xlabel='Parts per Million', ylabel='CDF')

#plotting a complementary CDF (CCDF) of O3
# same cdf as above, so the x-axis carries the same unit label
thinkplot.Cdf(cdf, complement=True)
thinkplot.Show(xlabel='Parts per Million', ylabel='CCDF', yscale='log')

#normal CDF with a range of parameters
thinkplot.PrePlot(3)

mus = [1.0, 2.0, 3.0]  #should change to my own numbers instead
sigmas = [0.5, 0.4, 0.3]

# one curve per (mu, sigma) pair, each rendered over x from -1.0 to 4.0
for mu, sigma in zip(mus, sigmas):
    xs, ps = thinkstats2.RenderNormalCdf(mu=mu,
                                         sigma=sigma,
                                         low=-1.0,
                                         high=4.0)
    label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma)
    thinkplot.Plot(xs, ps, label=label)

thinkplot.Config(title='Normal CDF',
                 xlabel='x',
                 ylabel='CDF',
                 loc='upper left')
thinkplot.Show()

#Scatterplots
thinkplot.Scatter(grp_pollution_df['NO2AQI'],
                  grp_pollution_df['SO2AQI'],
                  alpha=1)
thinkplot.Config(xlabel='NO2 & SO2 AQI',
Example #12
0
                 yscale='log',
                 loc='upper right')

#%% [markdown]
# ## Normal distribution
#
# Here's what the normal CDF looks like with a range of parameters.

#%%
thinkplot.PrePlot(3)

# one model curve per (mu, sigma) pair, all rendered over x from -1.0 to 4.0
mus = [1.0, 2.0, 3.0]
sigmas = [0.5, 0.4, 0.3]
for mu, sigma in zip(mus, sigmas):
    xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=-1.0, high=4.0)
    label = r'$\mu=%g$, $\sigma=%g$' % (mu, sigma)
    thinkplot.Plot(xs, ps, label=label)

thinkplot.Config(title='Normal CDF', xlabel='x', ylabel='CDF',
                 loc='upper left')

#%% [markdown]
# I'll use a normal model to fit the distribution of birth weights from the NSFG.

#%%
# load the pregnancy data and keep the non-missing total birth weights
preg = nsfg.ReadFemPreg()
weights = preg.totalwgt_lb.dropna()
Example #13
0
# CCDF of the data with a Pareto model on top
thinkplot.Cdf(log_cdf, complement=True)
xmin = 5000
alpha = 1.4
xs, ys = thinkstats2.RenderParetoCdf(xmin=xmin, alpha=alpha, low=0, high=1.0e7)
# take log10 of the model's xs so it is drawn against the log10 x-axis;
# 1 - ys turns the model CDF into a CCDF
thinkplot.Plot(np.log10(xs),
               1 - ys,
               label=r'model $x_m={}$  $\alpha={}$'.format(xmin, alpha))
thinkplot.Config(yscale='log', xlabel='log10 population', ylabel='CCDF')

#%%
# CDF of the data with a normal model fitted to the log populations
thinkplot.Cdf(log_cdf)
mu, var = thinkstats2.TrimmedMeanVar(log_pops, p=0.01)
sigma = np.sqrt(var)
# render the model four standard deviations either side of the mean
# (note: this rebinds xmin, which held the Pareto parameter above)
xmin = mu - 4.0 * sigma
xmax = mu + 4.0 * sigma
xs, ys = thinkstats2.RenderNormalCdf(mu, sigma, xmin, xmax)
thinkplot.plot(xs,
               ys,
               label=r'model $\mu$={:.2f} $\sigma$={:.2f}'.format(mu, sigma))
thinkplot.Config(xlabel='log10 population', ylabel='CDF')

#%%
PlotNormalProbability(log_pops, ylabel="log10 population")

#%% [markdown]
# ## 5.6 random

#%%
import math

Example #14
0
# largest budget in the data (displayed as the notebook cell's output)
max(filmsdata.budget)


# In[444]:


# smallest budget in the data (displayed as the notebook cell's output)
min(filmsdata.budget)


# In[445]:


# plot the model
# `mu` and `var` are not defined in this cell; they are expected to come from
# an earlier one (presumably the mean and variance of the budgets -- confirm).
sigma = np.sqrt(var)
print('Sigma', sigma)
# NOTE(review): the fixed range 7000..380000000 presumably mirrors the
# min/max budgets shown above -- confirm against the data.
xs, ps = thinkstats2.RenderNormalCdf(mu, sigma, low=7000, high=380000000)

thinkplot.Plot(xs, ps, label='model', color='0.6')

# plot the data: CDF of the budgets, drawn after the model
cdf = thinkstats2.Cdf(filmsdata.budget, label='data')

thinkplot.PrePlot(1)
thinkplot.Cdf(cdf) 
thinkplot.Config(title='Film Budgets Normal Distribution',
                 xlabel='Film Budgets',
                 ylabel='CDF')


# Next, I will observe the relationship between profit and budget. I will begin by plotting a scatterplot of these two variables.

# In[446]:
Example #15
0
def CDFVisualDist(cdf):
    """Shows a CDF in six forms on a 2x3 grid to compare it with analytic models.

    The panels are, in order: linear, lognormal, Pareto, exponential,
    normal, and Weibull. The lognormal, exponential and normal panels also
    draw a model curve; the Pareto, exponential and Weibull panels plot the
    CDF through a `transform` and apply the axis options that call returns.

    cdf: object with `xs` and `ps` attributes and `NaNMean()` / `NaNVar()`
         methods (presumably a thinkstats2.Cdf variant -- confirm)
    """
    xs, ps = cdf.xs, cdf.ps

    # set up subplots
    PrePlot(num=6, cols=3, rows=2)

    # linear plot
    SubPlot(1)
    Cdf(cdf, color='C0')
    Config(xlabel='x', ylabel='CDF', title='Linear Plot')

    # lognormal plot
    SubPlot(2)
    # CDF over log10(x), keeping the original cumulative probabilities
    xs_log = np.log10(xs)
    cdf_log = thinkstats2.Cdf(xs_log, ps, label='data')
    # model parameters: median as the center, std derived from the IQR
    # (IQRFromCDF / StdFromIQR are helpers not shown here; behavior is
    # presumed from their names -- confirm)
    median = cdf_log.Percentile(50)
    iqr = thinkstats2.IQRFromCDF(cdf_log)
    std = thinkstats2.StdFromIQR(iqr)
    # model range: smallest and largest log values, excluding infinities
    # and ignoring NaNs
    low = np.nanmin(xs_log[xs_log != -np.inf])
    high = np.nanmax(xs_log[xs_log != np.inf])

    x_norm, p_norm = thinkstats2.RenderNormalCdf(median,
                                                 std,
                                                 low=low,
                                                 high=high)
    Plot(x_norm, p_norm, label='model', color='0.8')

    Cdf(cdf_log, color='C0')
    Config(xlabel='log10 x', ylabel='CDF', title='Lognormal Plot')

    # pareto plot
    SubPlot(3)

    # Cdf returns a mapping of axis options that is forwarded to Config
    scale = Cdf(cdf, transform='pareto', color='C0')
    Config(xlabel='x', ylabel='CCDF', title='Pareto Plot', **scale)

    # exponential plot
    SubPlot(4)
    # exponential model with rate 1/mean
    mean = cdf.NaNMean()
    lam = 1 / mean
    # model range: smallest and largest x, excluding infinities and
    # ignoring NaNs
    low = np.nanmin(xs[xs != -np.inf])
    high = np.nanmax(xs[xs != np.inf])
    expo_xs, expo_ps = thinkstats2.RenderExpoCdf(lam, low, high)
    # 1 - ps turns the model CDF into a CCDF
    Plot(expo_xs, 1 - expo_ps, label='model', color='0.8')
    scale = Cdf(cdf, transform='exponential', color='C0')
    Config(xlabel='x', ylabel='CCDF', title='Exponential Plot', **scale)

    # normal plot
    SubPlot(5)
    # reuses `mean` from the exponential panel; `std` is rebound here
    var = cdf.NaNVar()
    std = np.sqrt(var)

    # model range: four standard deviations either side of the mean
    low = mean - 4 * std
    high = mean + 4 * std

    norm_xs, norm_ps = thinkstats2.RenderNormalCdf(mean, std, low, high)
    Cdf(cdf, color='C0')
    Plot(norm_xs, norm_ps, label='model', linewidth=4, color='0.8')
    Config(xlabel='x', ylabel='CDF', title='Normal Plot')

    # weibull plot
    SubPlot(6)
    scale = Cdf(cdf, transform='weibull', color='C0')
    Config(title='weibull transform',
           xlabel='log x',
           ylabel='log log CCDF',
           **scale)
    Show()