Exemple #1
0
def PlotQuadraticModel(daily, name):
    """Run the quadratic model and save three figures.

    Saves the fitted values ('timeseries11'), the residual percentiles
    ('timeseries12') and the predictions for years 0 to 5 ('timeseries13').

    daily: object whose `years` and `ppg` attributes are plotted; it is also
           passed to RunQuadraticModel and timeseries.PlotPredictions
    name: label used for the plotted data
    """
    price_label = 'price per gram ($)'

    model, results = RunQuadraticModel(daily)
    regression.SummarizeResults(results)

    # figure 1: data with the model's fitted values
    timeseries.PlotFittedValues(model, results, label=name)
    thinkplot.Save(root='timeseries11', title='fitted values',
                   xlabel='years', xlim=[-0.1, 3.8], ylabel=price_label)

    # figure 2: residual percentiles
    timeseries.PlotResidualPercentiles(model, results)
    thinkplot.Save(root='timeseries12', title='residuals',
                   xlabel='years', ylabel=price_label)

    # figure 3: data with predictions; the x axis is padded by 0.1 per side
    years = np.linspace(0, 5, 101)
    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)
    padded_xlim = [years[0] - 0.1, years[-1] + 0.1]
    thinkplot.Save(root='timeseries13', title='predictions',
                   xlabel='years', xlim=padded_xlim, ylabel=price_label)
Exemple #2
0
def ScatterFit(xs, ys, **options):
    """Show a scatter plot of the data with a fitted line drawn over it.

    xs: sequence of x values
    ys: sequence of y values
    options: extra keyword arguments forwarded to thinkplot.Show
    """
    intercept, gradient = LeastSquares(xs, ys)
    line_xs, line_ys = FitLine(xs, intercept, gradient)

    thinkplot.Scatter(xs, ys, color='blue', alpha=0.1, s=10)
    # a wide white stroke first, then a thinner red stroke on top of it
    for stroke_color, stroke_width in (('white', 3), ('red', 2)):
        thinkplot.Plot(line_xs, line_ys,
                       color=stroke_color, linewidth=stroke_width)
    thinkplot.Show(legend=False, **options)
def ScatterPlot(ages, weights, alpha=1.0):
    """Draw a scatter plot of weights against ages and configure the axes.

    ages: sequence of x values
    weights: sequence of y values
    alpha: alpha value passed to thinkplot.Scatter
    """
    thinkplot.Scatter(ages, weights, alpha=alpha)
    axis_options = dict(xlabel='age (years)',
                        ylabel='weight (lbs)',
                        xlim=[10, 45],
                        ylim=[0, 15],
                        legend=False)
    thinkplot.Config(**axis_options)
Exemple #4
0
def PlotArrivalDepartureDelayFit(flights):
    """Plot departure delay against arrival delay with a fitted line.

    Works on the rows returned by thinkstats2.SampleRows(flights, 1000):
    saves a scatter plot with the line from thinkstats2.LeastSquares and
    FitLine, then fits 'DEPARTURE_DELAY ~ ARRIVAL_DELAY' with smf.ols and
    passes the results to regression.SummarizeResults.

    flights: table with ARRIVAL_DELAY and DEPARTURE_DELAY columns
    """
    sample = thinkstats2.SampleRows(flights, 1000)
    xs = sample.ARRIVAL_DELAY
    ys = sample.DEPARTURE_DELAY

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    fit_xs, fit_ys = thinkstats2.FitLine(xs, inter, slope)

    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    # a wide white stroke first, then a thinner blue stroke on top of it
    for stroke_color, stroke_width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(fit_xs, fit_ys,
                       color=stroke_color, linewidth=stroke_width)
    thinkplot.Save(root='ArrivalDepartureDelayFit_linear1',
                   xlabel='arrival delay (min)',
                   ylabel='departure delay (min)',
                   legend=False)

    results = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=sample).fit()
    regression.SummarizeResults(results)
Exemple #5
0
def MakeArrivalDepartureDelayScatterPlots(flights):
    """Save a scatter plot of departure delay against arrival delay.

    The delays come from GetArrivalDepartureDelay, called with
    hjitter=1.3 and wjitter=0.5 on the rows returned by
    thinkstats2.SampleRows(flights, 10000).

    flights: table passed to thinkstats2.SampleRows
    """
    sample = thinkstats2.SampleRows(flights, 10000)

    # NOTE(review): two columns are requested but only one plot is drawn
    # below -- confirm whether the second panel is still wanted.
    thinkplot.PrePlot(cols=2)

    delays = GetArrivalDepartureDelay(sample, hjitter=1.3, wjitter=0.5)
    departureDelays, arrivalDelays = delays

    thinkplot.Scatter(arrivalDelays, departureDelays, alpha=1)
    thinkplot.Config(xlabel='arrival delay (min)',
                     ylabel='departure delay (min)',
                     legend=False)

    thinkplot.Save(root='ArrivalDepartureDelayScatterplot')
Exemple #6
0
def PlotEwmaPredictions(daily, name):
    """Plot observed prices and a one-year linear extrapolation of the EWMA.

    The table returned by timeseries.FillMissing(daily) must provide `ppg`
    and `ewma` columns.  The slope is the EWMA (span=180) of the day-to-day
    differences in `ppg`; the last `ewma` value and the last slope define a
    straight line that fills the `ewma` column for the 365 days appended
    after the last observed date.

    daily: DataFrame with a date index and a `ppg` column
    name: label for the scatter plot of the observed values
    """
    # use EWMA to estimate slopes; Series.ewm(...).mean() replaces the
    # pandas.ewma function that newer pandas releases no longer provide
    filled = timeseries.FillMissing(daily)
    filled['slope'] = filled.ppg.diff().ewm(span=180).mean()

    # extract the last inter and slope; .iloc makes the positional lookup
    # explicit instead of relying on integer fallback on a date index
    start = filled.index[-1]
    inter = filled['ewma'].iloc[-1]
    slope = filled['slope'].iloc[-1]

    # reindex the DataFrame, adding a year to the end
    dates = pandas.date_range(filled.index.min(),
                              filled.index.max() + np.timedelta64(365, 'D'))
    predicted = filled.reindex(dates)

    # generate predicted values and add them to the end
    predicted['date'] = predicted.index
    one_day = np.timedelta64(1, 'D')
    predicted['days'] = (predicted.date - start) / one_day
    predict = inter + slope * predicted.days
    # assign the result back: an in-place fillna on an attribute-accessed
    # column may act on a temporary copy and leave `predicted` unchanged
    predicted['ewma'] = predicted['ewma'].fillna(predict)

    # plot the actual values and predictions
    thinkplot.Scatter(daily.ppg, alpha=0.1, label=name)
    thinkplot.Plot(predicted['ewma'])
    # NOTE(review): Save() is called without a root -- confirm this is the
    # intended way to finish the figure.
    thinkplot.Save()
Exemple #7
0
def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    """Show a scatter plot of birth weight against age.

    age: sequence of x values
    wgt: sequence of y values
    xmin, xmax: limits of the x axis
    ymin, ymax: limits of the y axis
    """
    thinkplot.Scatter(age, wgt, alpha=1.0)

    x_range = [xmin, xmax]
    y_range = [ymin, ymax]
    thinkplot.Config(xlabel='Age (Years)', ylabel='Birth Weight (lbs)',
                     xlim=x_range, ylim=y_range, legend=False)
    thinkplot.Show()
Exemple #8
0
def PlotFilled(daily, name):
    """Save a plot ('timeseries8') of the filled prices and their EWMA.

    daily: DataFrame of daily prices, passed to FillMissing with span=30
    name: label for the scatter plot of the filled prices
    """
    filled = FillMissing(daily, span=30)
    prices, smoothed = filled.ppg, filled.ewma

    thinkplot.Scatter(prices, s=15, alpha=0.3, label=name)
    thinkplot.Plot(smoothed, label='EWMA', alpha=0.4)
    pyplot.xticks(rotation=30)

    thinkplot.Save(root='timeseries8', ylabel='price per gram ($)')
def PlotFittedValues(model, results, label=''):
    """Draw the observed values as a scatter plot and the fit as a line.

    model: object providing `exog` (column 1 is used as x) and `endog`
    results: object providing `fittedvalues`
    label: label for the scatter plot of the observed values
    """
    xs = model.exog[:, 1]
    observed = model.endog
    fitted = results.fittedvalues

    thinkplot.Scatter(xs, observed, s=15, label=label)
    thinkplot.Plot(xs, fitted, label='model')
Exemple #10
0
def PlotSimplePrediction(results, years):
    """Show the observed prices together with a simple prediction.

    results: passed to GenerateSimplePrediction
    years: sequence of x values for the prediction; its first and last
           elements, padded by 0.1, set the x-axis limits

    NOTE(review): `daily` and `name` are neither parameters nor locals of
    this function; presumably they are module-level names -- confirm.
    """
    predicted = GenerateSimplePrediction(results, years)

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.2, label=name)
    thinkplot.Plot(years, predicted, color='#ff7f00')

    left, right = years[0] - 0.1, years[-1] + 0.1
    thinkplot.Show(title='Predictions', xlabel='Years', xlim=(left, right),
                   ylabel='Price per gram ($)', loc='upper right')
def main(name):
    """Run the time-series analyses and save the resulting figures.

    name: ignored on entry; the local name is rebound to 'high' and then
          'medium' to select entries of the dailies mapping
    """
    thinkstats2.RandomSeed(18)
    dailies = GroupByQualityAndDay(ReadData())

    # analyses that take the whole mapping
    for analysis in (PlotDailies, RunModels,
                     PrintSerialCorrelations, MakeAcfPlot):
        analysis(dailies)

    name = 'high'
    daily = dailies[name]
    for plot in (PlotLinearModel, PlotRollingMean, PlotFilled):
        plot(daily, name)

    # both prediction figures share everything except the file root
    years = np.linspace(0, 5, 101)
    xlim = years[0] - 0.1, years[-1] + 0.1
    save_options = dict(title='predictions',
                        xlabel='years',
                        xlim=xlim,
                        ylabel='price per gram ($)',
                        formats=FORMATS)

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    PlotPredictions(daily, years)
    thinkplot.Save(root='timeseries4', **save_options)

    name = 'medium'
    daily = dailies[name]

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    PlotIntervals(daily, years)
    PlotPredictions(daily, years)
    thinkplot.Save(root='timeseries5', **save_options)
def PlotRollingMean(daily, name):
    """Plots rolling mean and EWMA.

    Saves a two-column figure ('timeseries10'): the left panel shows the
    30-day rolling mean of `ppg`, the right panel its EWMA with span=30.
    Both panels also show the raw values as a scatter plot.

    daily: DataFrame of daily prices
    name: label for the scatter plots of the raw values
    """
    # reindex on every calendar day so missing days become explicit gaps
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)

    thinkplot.PrePlot(cols=2)
    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
    # Series.rolling(...).mean() replaces pandas.rolling_mean, which newer
    # pandas releases no longer provide
    roll_mean = reindexed.ppg.rolling(30).mean()
    thinkplot.Plot(roll_mean, label='rolling mean')
    pyplot.xticks(rotation=30)
    thinkplot.Config(ylabel='price per gram ($)')

    thinkplot.SubPlot(2)
    thinkplot.Scatter(reindexed.ppg, s=15, alpha=0.1, label=name)
    # Series.ewm(...).mean() likewise replaces pandas.ewma
    ewma = reindexed.ppg.ewm(span=30).mean()
    thinkplot.Plot(ewma, label='EWMA')
    pyplot.xticks(rotation=30)
    thinkplot.Save(root='timeseries10', formats=FORMATS)
Exemple #13
0
def ScatterPlot(heights, weights, alpha=1.0):
    """Draw a scatter plot of weights against heights and configure the axes.

    Nothing is saved or shown here.

    heights: sequence of float
    weights: sequence of float
    alpha: float
    """
    thinkplot.Scatter(heights, weights, alpha=alpha)

    axis_options = dict(xlabel='height (cm)',
                        ylabel='weight (kg)',
                        axis=[140, 210, 20, 200],
                        legend=False)
    thinkplot.Config(**axis_options)
Exemple #14
0
def PlotFilled(daily, name):
    """Plot the EWMA and filled data.

    daily: DataFrame of daily prices, passed to FillMissing with span=30
    name: string label for the scatter plot of the filled prices
    """
    filled = FillMissing(daily, span=30)
    thinkplot.Scatter(filled.ppg, s=15, alpha=0.2, label=name)
    thinkplot.Plot(filled.ewma, label='EWMA', color='#ff7f00')
    plt.xticks(rotation=30)
    # `ylabel`, not `label`: the other price plots in this file label the
    # y axis this way, and `label` is not an axis option
    thinkplot.Config(ylabel='Price per gram ($)')
    thinkplot.Show()
def ScatterPlot(ages, weights, alpha=1.0):
    """Draw a scatter plot of weights against ages and configure the axes.

    Nothing is saved or shown here.

    ages: sequence of float
    weights: sequence of float
    alpha: float
    """
    thinkplot.Scatter(ages, weights, alpha=alpha)

    age_range = [10, 45]
    weight_range = [0, 15]
    thinkplot.Config(xlabel='age (years)', ylabel='weight (lbs)',
                     xlim=age_range, ylim=weight_range, legend=False)
def ScatterPlot(root, heights, weights, alpha=1.0):
    """Draw a scatter plot of weights against heights and save it.

    root: string filename root
    heights: sequence of float
    weights: sequence of float
    alpha: float
    """
    thinkplot.Scatter(heights, weights, alpha=alpha)

    save_options = dict(xlabel='Height (cm)',
                        ylabel='Weight (kg)',
                        axis=[140, 210, 20, 200],
                        legend=False)
    thinkplot.Save(root=root, **save_options)
Exemple #17
0
def PlotRollingMean(daily, name):
    """Show the prices together with their 30-day rolling mean.

    daily: DataFrame of daily prices
    name: string
    """
    # reindex on every calendar day between the first and last observation
    first_day = daily.index.min()
    last_day = daily.index.max()
    prices = daily.reindex(pd.date_range(first_day, last_day)).ppg

    thinkplot.Scatter(prices, s=15, alpha=0.2, label=name)
    thinkplot.Plot(prices.rolling(30).mean(),
                   label='rolling mean', color='#ff7f00')
    plt.xticks(rotation=30)
    thinkplot.Config(ylabel='price per gram ($)')
    thinkplot.Show()
def sim_pearson(perfs, p1, p2, show=None):
    """Return the Pearson correlation coefficient of two people's ratings.

    r = cov(X, Y) / (sigma_X * sigma_Y), i.e. the covariance of X and Y
    divided by the product of their standard deviations, computed over the
    items that appear under both p1 and p2.

    perfs: nested mapping, person -> {item: rating}
    p1, p2: keys of perfs
    show: when true, also print the value from correlation.Corr next to r
          and show a scatter plot of the shared ratings.  When omitted, a
          module-level `show` flag is used if one exists, otherwise False.

    returns: the coefficient, or 0 when there are no shared items or when
             either side has zero variance
    """
    if show is None:
        # This function used to read a free variable `show`; keep honouring
        # a module-level flag, but do not raise NameError when none exists.
        show = globals().get('show', False)

    shared_items = [item for item in perfs[p1] if item in perfs[p2]]
    n = len(shared_items)
    if n == 0:
        return 0

    # ratings that p1 and p2 have in common
    data_p1 = [perfs[p1][it] for it in shared_items]
    data_p2 = [perfs[p2][it] for it in shared_items]

    # mean rating of each person
    mu_p1 = sum(data_p1) / n
    mu_p2 = sum(data_p2) / n

    # variance of each person's ratings
    var_p1 = sum((it - mu_p1) ** 2 for it in data_p1) / n
    var_p2 = sum((it - mu_p2) ** 2 for it in data_p2) / n

    # a constant series has no defined correlation; report 0
    if var_p1 == 0 or var_p2 == 0:
        return 0

    # covariance
    cov = sum((x - mu_p1) * (y - mu_p2)
              for x, y in zip(data_p1, data_p2)) / n

    # Pearson correlation coefficient
    r = cov / sqrt(var_p1 * var_p2)

    # cross-check against the thinkstats implementation
    if show:
        rr = correlation.Corr(data_p1, data_p2)
        print(r, rr)
        thinkplot.Clf()
        thinkplot.Scatter(data_p1, data_p2)
        thinkplot.Show()

    return r
Exemple #19
0
def main(name, data_dir='.'):
    """Save a scatter plot of the data and print correlation statistics.

    Prints the Pearson and Spearman correlations, ten results of
    SimulateNull and the result of PValue(xs, ys, 1000).

    name: not used in the body
    data_dir: passed to ReadData
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # print(...) with a single pre-formatted argument produces the same
    # output on Python 2 and Python 3; the one-element tuple keeps a
    # tuple-valued result from being unpacked by the % operator
    print('Pearson %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))
def scatter(x):
    """Show total crimes plotted against column x of the module-level df.

    For any column other than 'month', also print the Spearman correlation
    and the covariance between the total and that column.

    x: column name in df
    """
    tot_crimes = df.Total_crimes
    thinkplot.Scatter(df[x], tot_crimes, alpha=.5)

    # the time axis gets fixed labels and no statistics
    if x == 'month':
        thinkplot.Show(title="Total Crimes vs Time",
                       xlabel="Year",
                       ylabel="Total Crimes")
        return

    crime_label = x + " Crimes"
    thinkplot.Show(title="Total Crimes vs " + crime_label,
                   xlabel=crime_label,
                   ylabel="Total Crimes")

    print(x + " crime stats")
    spearman = thinkstats2.SpearmanCorr(tot_crimes, df[x])
    print("Spearman's correlation:", spearman)
    covariance = thinkstats2.Cov(tot_crimes, df[x])
    print("Covariance:", covariance)
    print()
def PlotDailies(dailies):
    """Save a three-row figure ('timeseries1') of daily prices per quality.

    dailies: map from name to DataFrame
    """
    thinkplot.PrePlot(rows=3)

    for panel, (name, daily) in enumerate(dailies.items(), start=1):
        thinkplot.SubPlot(panel)

        # only the first panel carries the title
        if panel == 1:
            title = 'price per gram ($)'
        else:
            title = ''
        thinkplot.Config(ylim=[0, 20], title=title)
        thinkplot.Scatter(daily.ppg, s=10, label=name)

        # only the third panel keeps its x ticks, rotated
        if panel != 3:
            thinkplot.Config(xticks=[])
        else:
            pyplot.xticks(rotation=30)

    thinkplot.Save(root='timeseries1', formats=FORMATS)
Exemple #22
0
def PlotFit(live):
    """Save a scatter plot ('linear1') of birth weight against age with a fit.

    live: DataFrame with `agepreg` and `totalwgt_lb` columns
    """
    xs, ys = live.agepreg, live.totalwgt_lb

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    fit_xs, fit_ys = thinkstats2.FitLine(xs, inter, slope)

    thinkplot.Scatter(xs, ys, color='gray', alpha=0.1)
    # a wide white stroke first, then a thinner blue stroke on top of it
    for stroke_color, stroke_width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(fit_xs, fit_ys,
                       color=stroke_color, linewidth=stroke_width)

    thinkplot.Save(root='linear1', xlabel='age (years)',
                   ylabel='birth weight (lbs)', axis=[10, 45, 0, 15],
                   legend=False)
Exemple #23
0
def MakeFigures(pool, firsts, others):
    """Creates several figures for the book.

    Saves five figures: the CDFs of age and weight for `pool`, the paired
    age and weight CDFs of `firsts` and `others`, and a scatter plot of the
    ages and weights returned by GetAgeWeight(pool).

    pool: object with `age_cdf` and `weight_cdf`; also passed to GetAgeWeight
    firsts: object with `age_cdf` and `weight_cdf`
    others: object with `age_cdf` and `weight_cdf`
    """
    # CDF of all ages
    thinkplot.Clf()
    thinkplot.Cdf(pool.age_cdf)
    thinkplot.Save(root='agemodel_age_cdf',
                   title="Distribution of mother's age",
                   xlabel='age (years)',
                   ylabel='CDF',
                   legend=False)

    # CDF of all weights
    thinkplot.Clf()
    thinkplot.Cdf(pool.weight_cdf)
    thinkplot.Save(root='agemodel_weight_cdf',
                   title="Distribution of birth weight",
                   xlabel='birth weight (oz)',
                   ylabel='CDF',
                   legend=False)

    # CDFs of mother's age for first babies and others
    thinkplot.Clf()
    thinkplot.Cdfs([firsts.age_cdf, others.age_cdf])
    thinkplot.Save(root='agemodel_age_cdfs',
                   title="Distribution of mother's age",
                   xlabel='age (years)',
                   ylabel='CDF')

    # CDFs of birth weight for first babies and others
    thinkplot.Clf()
    thinkplot.Cdfs([firsts.weight_cdf, others.weight_cdf])
    thinkplot.Save(root='agemodel_weight_cdfs',
                   title="Distribution of birth weight",
                   xlabel='birth weight (oz)',
                   ylabel='CDF')

    # make a scatterplot of ages and weights
    ages, weights = GetAgeWeight(pool)
    # Clf, spelled like every other call in this function; the lowercase
    # name only works where thinkplot defines that alias
    thinkplot.Clf()
    thinkplot.Scatter(ages, weights, alpha=0.2)
    thinkplot.Save(root='agemodel_scatter',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   legend=False)
Exemple #24
0
def main():
    """Generate correlated data, print summary statistics and plot it.

    Uses CorrelatedGenerator(1000, rho) with rho = -0.8, applies the linear
    transform a * x + b (a = 1.0, b = 0.0) to the x values, prints mean and
    variance, covariance, Pearson and Spearman correlation, then shows a
    scatter plot.
    """
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # linear transform of xs; with a = 1 and b = 0 it leaves them unchanged
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # print(...) with a single pre-formatted argument produces the same
    # output on Python 2 and Python 3; the one-element tuple keeps a
    # tuple-valued result from being unpacked by the % operator
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('covariance %s' % (thinkstats2.Cov(xs, ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))
    print('Spearman corr %s' % (thinkstats2.SpearmanCorr(xs, ys),))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
Exemple #25
0
def main():
    """Generate data with SatIqData, fit a line and report the fit.

    Prints mean and variance of both variables, their Pearson correlation,
    the intercept and slope from thinkstats2.LeastSquares and the value
    from thinkstats2.CoefDetermination, then shows the fitted line together
    with a scatter plot of the data.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    # print(...) with a single pre-formatted argument produces the same
    # output on Python 2 and Python 3; the one-element tuple keeps a
    # tuple-valued result from being unpacked by the % operator
    print('mean, var of x %s' % (thinkstats2.MeanVar(xs),))
    print('mean, var of y %s' % (thinkstats2.MeanVar(ys),))
    print('Pearson corr %s' % (thinkstats2.Corr(xs, ys),))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter %s' % (inter,))
    print('slope %s' % (slope,))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 %s' % (R2,))

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def PlotQuadraticModel(daily, name):
    """Run the quadratic model and save three figures.

    Saves the fitted values ('Output_Timeseries1'), the residual
    percentiles ('Output_Timeseries2') and the predictions for years 0 to
    10 ('Output_Timeseries3').

    daily: object whose `years` and `ppg` attributes are plotted; it is also
           passed to RunQuadraticModel and timeseries.PlotPredictions
    name: label used for the plotted data
    """
    price_label = 'price per gram ($)'

    model, results = RunQuadraticModel(daily)
    regression.SummarizeResults(results)

    # figure 1: data with the model's fitted values
    timeseries.PlotFittedValues(model, results, label=name)
    thinkplot.Save(root='Output_Timeseries1', title='Fitted Val',
                   xlabel='yr', xlim=[-0.2, 4], ylabel=price_label)

    # figure 2: residual percentiles
    timeseries.PlotResidualPercentiles(model, results)
    thinkplot.Save(root='Output_Timeseries2', title='Residual',
                   xlabel='yr', ylabel=price_label)

    # figure 3: data with predictions; the x axis is padded by 0.1 per side
    years = np.linspace(0, 10, 200)
    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)
    padded_xlim = [years[0] - 0.1, years[-1] + 0.1]
    thinkplot.Save(root='Output_Timeseries3', title='Predict',
                   xlabel='yr', xlim=padded_xlim, ylabel=price_label)
Exemple #27
0
#%%
# Plot the observed data with the model's fitted values and label the axes.
# `model`, `results` and `name` are not defined in this cell; presumably
# they are set by earlier cells -- confirm.
timeseries.PlotFittedValues(model, results, label=name)
thinkplot.Config(title='Fitted Values',
                 xlabel='years',
                 xlim=[-0.1, 3.8],
                 ylabel='price ($)/gram')

#%%
# Plot predictions.

# 101 evenly spaced values from 0 to 5 at which predictions are plotted
years = np.linspace(0, 5, 101)

# observed prices (`daily` is likewise not defined in this cell)
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
# RunQuadraticModel is handed over as `func`, presumably so the predictions
# are generated with the quadratic model -- confirm against PlotPredictions
timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)

# pad the x axis by 0.1 beyond the first and last year
thinkplot.Config(title='predictions',
                 xlabel='years',
                 xlim=[years[0] - 0.1, years[-1] + 0.1],
                 ylabel='price ($)/gram')

#%% [markdown]
# ### Exercise 12.2
# Write a definition for a class named `SerialCorrelationTest` that extends `HypothesisTest` from Section 9.2. It should take a series and a lag as data, compute the serial correlation of the series with the given lag, and then compute the p-value of the observed correlation.
#
# Use this class to test whether the serial correlation in raw price data is statistically significant. Also test the residuals of the linear model and (if you did the previous exercise), the quadratic model.


#%%
Exemple #28
0
# CDF of the Age column (`df` is defined outside this snippet)
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# Normal probability plot: a straight "model" line with intercept = mean
# and slope = std over x in [-4, 4], followed by the values returned by
# thinkstats2.NormalProbability for Age
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')
xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# Scatter plots and correlation
# Year vs. Age; Year is passed through thinkstats2.Jitter (0.25) before plotting
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# NOTE(review): the correlation is computed but neither stored nor printed;
# presumably it was displayed as notebook cell output -- confirm.
thinkstats2.Corr(df.Year, df.Age)
# Age vs. Drug
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# Test for a difference between the Age values of `male` and `female`
# (both defined outside this snippet) using DiffMeansPermute
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')
Exemple #29
0
# imports
from __future__ import print_function, division
%matplotlib inline
import numpy as np
import thinkstats2
import thinkplot

# get data
import first
# Load the frames and keep only rows with both agepreg and totalwgt_lb
live, firsts, others = first.MakeFrames()
live = live.dropna(subset=['agepreg', 'totalwgt_lb'])
ages = live.agepreg
weights = live.totalwgt_lb

# Make a scatter plot of birth weight versus mother's age
thinkplot.Scatter(ages, weights, alpha=1, s=10)
thinkplot.Config(xlabel='Age (years)',
                 ylabel='Weight (lbs)',
                 xlim=[10, 45],
                 ylim=[0, 15],
                 legend=False)
# RESULTS: messy plot

# Plot percentiles of birth weight versus mother's age.
# Bin edges run from 10 to 40 in steps of 5; np.digitize gives each row the
# index of its age bin, and the rows are then grouped by that index.
bins = np.arange(10, 45, 5)
indices = np.digitize(live.agepreg, bins)
groups = live.groupby(indices)

# Per age bin: the mean age and the CDF of birth weight
mean_ages = [group.agepreg.mean() for i, group in groups]
cdfs_wgt= [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]
        meanx = np.mean(xs)
    if meany is None:
        meany = np.mean(ys)

    cov = np.dot(xs-meanx, ys-meany) / len(xs)
    return cov


# In[65]:

def Jitter(values, jitter=0.5):
    """Return values with zero-mean normal noise added to each element.

    values: sequence of numbers
    jitter: standard deviation of the noise

    returns: the element-wise sum of the noise array and values
    """
    noise = np.random.normal(loc=0, scale=jitter, size=len(values))
    return noise + values
# Jitter the data before plotting. Note that despite their names,
# `heights` holds the jittered wins and `weights` the jittered runs
# (`wins` and `runs` are defined outside this snippet).
heights = Jitter(wins, 1.4)
weights = Jitter(runs, 0.5)
# scatter plot with wins on the x axis and runs on the y axis
thinkplot.Scatter(heights, weights, alpha=0.9, s=40,color='blue')
thinkplot.Config(xlabel='Wins',
                 ylabel='Runs',
                 axis=[0, 210, 20, 200],
                 legend=False)


# In[66]:

# Fifth part: compute the covariance of the two samples, which is needed
# in order to compute the correlation. The result is not assigned here;
# presumably it is displayed as notebook cell output -- confirm.
Cov(wins_sample, runs_sample)


# In[67]: