def PlotQuadraticModel(daily, name):
    """Fit the quadratic model to daily prices and save three figures.

    Saves, in order: fitted values ('timeseries11'), residual percentiles
    ('timeseries12') and predictions for years 0 through 5 ('timeseries13').

    daily: DataFrame of daily prices; its `years` and `ppg` columns are plotted
    name: string label for the scattered data
    """
    price_label = 'price per gram ($)'

    model, results = RunQuadraticModel(daily)
    regression.SummarizeResults(results)

    # figure 1: observed data with the fitted curve
    timeseries.PlotFittedValues(model, results, label=name)
    thinkplot.Save(root='timeseries11',
                   title='fitted values',
                   xlabel='years',
                   xlim=[-0.1, 3.8],
                   ylabel=price_label)

    # figure 2: residual percentiles
    timeseries.PlotResidualPercentiles(model, results)
    thinkplot.Save(root='timeseries12',
                   title='residuals',
                   xlabel='years',
                   ylabel=price_label)

    # figure 3: predictions drawn over the raw data
    years = np.linspace(0, 5, 101)
    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)
    thinkplot.Save(root='timeseries13',
                   title='predictions',
                   xlabel='years',
                   xlim=[years[0] - 0.1, years[-1] + 0.1],
                   ylabel=price_label)
def ScatterFit(xs, ys, **options):
    """Show a scatter plot of ys versus xs with a least-squares line on top.

    xs: sequence of x values
    ys: sequence of y values
    options: keyword arguments forwarded to thinkplot.Show
    """
    intercept, gradient = LeastSquares(xs, ys)
    line_xs, line_ys = FitLine(xs, intercept, gradient)

    thinkplot.Scatter(xs, ys, color='blue', alpha=0.1, s=10)

    # The line is drawn twice: a wide white stroke first, then a narrower
    # red stroke over it.
    for stroke_color, stroke_width in (('white', 3), ('red', 2)):
        thinkplot.Plot(line_xs, line_ys,
                       color=stroke_color, linewidth=stroke_width)

    thinkplot.Show(legend=False, **options)
def ScatterPlot(ages, weights, alpha=1.0):
    """Draw a scatter plot of weights versus ages and configure the axes.

    The figure is configured but neither shown nor saved here.

    ages: sequence of x values, labelled 'age (years)'
    weights: sequence of y values, labelled 'weight (lbs)'
    alpha: float opacity passed to thinkplot.Scatter
    """
    axis_options = dict(xlabel='age (years)',
                        ylabel='weight (lbs)',
                        xlim=[10, 45],
                        ylim=[0, 15],
                        legend=False)

    thinkplot.Scatter(ages, weights, alpha=alpha)
    thinkplot.Config(**axis_options)
def PlotArrivalDepartureDelayFit(flights):
    """Plot departure delay against arrival delay with a fitted line.

    Works on a 1000-row sample of `flights`: saves the scatter plot with a
    least-squares line, then fits an OLS model with the same formula and
    prints its summary.

    flights: DataFrame with ARRIVAL_DELAY and DEPARTURE_DELAY columns
    """
    sample = thinkstats2.SampleRows(flights, 1000)
    arrival = sample.ARRIVAL_DELAY
    departure = sample.DEPARTURE_DELAY

    intercept, gradient = thinkstats2.LeastSquares(arrival, departure)
    line_xs, line_ys = thinkstats2.FitLine(arrival, intercept, gradient)

    thinkplot.Scatter(arrival, departure, color='gray', alpha=0.1)
    # wide white stroke underneath, narrower blue stroke on top
    for stroke_color, stroke_width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys,
                       color=stroke_color, linewidth=stroke_width)

    # No explicit axis limits are passed (the original had them disabled).
    thinkplot.Save(root='ArrivalDepartureDelayFit_linear1',
                   xlabel='arrival delay (min)',
                   ylabel='departure delay (min)',
                   legend=False)

    # the same relationship, fitted with statsmodels for the full summary
    ols_model = smf.ols('DEPARTURE_DELAY ~ ARRIVAL_DELAY', data=sample)
    regression.SummarizeResults(ols_model.fit())
def MakeArrivalDepartureDelayScatterPlots(flights):
    """Save a jittered scatter plot of departure delay versus arrival delay.

    flights: DataFrame of flights; a 10000-row sample is plotted
    """
    sample = thinkstats2.SampleRows(flights, 10000)

    # A two-column layout is requested, but the plain (un-jittered) panel
    # and the switch to the second subplot are disabled, so only the
    # jittered scatter below is drawn.
    thinkplot.PrePlot(cols=2)

    departure, arrival = GetArrivalDepartureDelay(sample,
                                                  hjitter=1.3,
                                                  wjitter=0.5)
    thinkplot.Scatter(arrival, departure, alpha=1)

    # no explicit axis limits are set
    thinkplot.Config(xlabel='arrival delay (min)',
                     ylabel='departure delay (min)',
                     legend=False)
    thinkplot.Save(root='ArrivalDepartureDelayScatterplot')
def PlotEwmaPredictions(daily, name):
    """Extend the EWMA of daily prices one year ahead and plot the result.

    The slope is estimated as the EWMA (span 180) of day-to-day price
    differences; predictions continue linearly from the last EWMA value.

    daily: DataFrame of daily prices with a `ppg` column
    name: string label for the scattered data
    """
    # NOTE(review): relies on timeseries.FillMissing returning a frame that
    # already has an `ewma` column -- confirm against that helper.
    filled = timeseries.FillMissing(daily)

    # pandas.ewma() no longer exists; Series.ewm(...).mean() is the
    # equivalent call with the same defaults.
    filled['slope'] = filled['ppg'].diff().ewm(span=180).mean()

    # Last date, level and slope. .iloc makes the positional lookup explicit
    # instead of relying on integer fallback on a date index.
    start = filled.index[-1]
    inter = filled['ewma'].iloc[-1]
    slope = filled['slope'].iloc[-1]

    # reindex the DataFrame, adding a year of empty rows to the end
    dates = pandas.date_range(filled.index.min(),
                              filled.index.max() + np.timedelta64(365, 'D'))
    predicted = filled.reindex(dates)

    # days elapsed since the last observed date (negative for the past)
    predicted['date'] = predicted.index
    one_day = np.timedelta64(1, 'D')
    predicted['days'] = (predicted['date'] - start) / one_day

    # Fill only the missing EWMA values (the appended year) with the linear
    # extrapolation. Assigning the result back avoids an in-place fillna on
    # an attribute-access temporary.
    predict = inter + slope * predicted['days']
    predicted['ewma'] = predicted['ewma'].fillna(predict)

    # plot the actual values and predictions
    thinkplot.Scatter(daily.ppg, alpha=0.1, label=name)
    thinkplot.Plot(predicted['ewma'])
    thinkplot.Save()
def PlotScatter(age, wgt, xmin, xmax, ymin, ymax):
    """Show a scatter plot of birth weight versus age.

    age: sequence of x values
    wgt: sequence of y values
    xmin, xmax: limits of the x axis
    ymin, ymax: limits of the y axis
    """
    x_range = [xmin, xmax]
    y_range = [ymin, ymax]

    thinkplot.Scatter(age, wgt, alpha=1.0)
    thinkplot.Config(xlabel='Age (Years)',
                     ylabel='Birth Weight (lbs)',
                     xlim=x_range,
                     ylim=y_range,
                     legend=False)
    thinkplot.Show()
def PlotFilled(daily, name):
    """Save a figure of the filled price series with its EWMA.

    daily: DataFrame of daily prices
    name: string label for the scattered prices
    """
    filled_prices = FillMissing(daily, span=30)

    thinkplot.Scatter(filled_prices.ppg, s=15, alpha=0.3, label=name)
    thinkplot.Plot(filled_prices.ewma, label='EWMA', alpha=0.4)

    # slant the tick labels on the x axis
    pyplot.xticks(rotation=30)
    thinkplot.Save(root='timeseries8',
                   ylabel='price per gram ($)')
def PlotFittedValues(model, results, label=''):
    """Plot the observed data and the model's fitted values.

    model: StatsModel model object; column 1 of `exog` is used as x and
           `endog` as the observed y
    results: StatsModel results object providing `fittedvalues`
    label: string label for the observed data
    """
    xs = model.exog[:, 1]
    observed = model.endog

    thinkplot.Scatter(xs, observed, s=15, label=label)
    thinkplot.Plot(xs, results.fittedvalues, label='model')
def PlotSimplePrediction(results, years):
    """Show the observed prices together with a simple prediction curve.

    NOTE: `daily` and `name` are not parameters; they are looked up in the
    enclosing (module) scope and must be defined before this is called.

    results: passed through to GenerateSimplePrediction
    years: sequence of years to predict for; also sets the x-axis range
    """
    predicted = GenerateSimplePrediction(results, years)

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.2, label=name)
    thinkplot.Plot(years, predicted, color='#ff7f00')

    # pad the x range by 0.1 on either side of the requested years
    lower = years[0] - 0.1
    upper = years[-1] + 0.1
    thinkplot.Show(title='Predictions',
                   xlabel='Years',
                   xlim=(lower, upper),
                   ylabel='Price per gram ($)',
                   loc='upper right')
def main(name):
    """Run the time-series analysis and save its figures.

    name: ignored; it is reassigned to 'high' and then 'medium' below
    """
    thinkstats2.RandomSeed(18)
    transactions = ReadData()
    dailies = GroupByQualityAndDay(transactions)

    # analyses that run over every quality category
    PlotDailies(dailies)
    RunModels(dailies)
    PrintSerialCorrelations(dailies)
    MakeAcfPlot(dailies)

    # both prediction figures share the years, x range and labels
    years = np.linspace(0, 5, 101)
    xlim = years[0] - 0.1, years[-1] + 0.1
    prediction_options = dict(title='predictions',
                              xlabel='years',
                              xlim=xlim,
                              ylabel='price per gram ($)',
                              formats=FORMATS)

    # 'high' quality: individual model figures, then predictions
    name = 'high'
    daily = dailies[name]
    PlotLinearModel(daily, name)
    PlotRollingMean(daily, name)
    PlotFilled(daily, name)

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    PlotPredictions(daily, years)
    thinkplot.Save(root='timeseries4', **prediction_options)

    # 'medium' quality: predictions with intervals
    name = 'medium'
    daily = dailies[name]

    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    PlotIntervals(daily, years)
    PlotPredictions(daily, years)
    thinkplot.Save(root='timeseries5', **prediction_options)
def PlotRollingMean(daily, name):
    """Save a two-panel figure: 30-day rolling mean and 30-day-span EWMA.

    daily: DataFrame of daily prices with a `ppg` column
    name: string label for the scattered prices
    """
    # Reindex onto every calendar day in range so missing days become NaN
    # rows and the windows are measured in days.
    dates = pandas.date_range(daily.index.min(), daily.index.max())
    reindexed = daily.reindex(dates)
    prices = reindexed.ppg

    # left panel: rolling mean
    thinkplot.PrePlot(cols=2)
    thinkplot.Scatter(prices, s=15, alpha=0.1, label=name)
    # pandas.rolling_mean() no longer exists; Series.rolling(...).mean()
    # is the equivalent call.
    roll_mean = prices.rolling(30).mean()
    thinkplot.Plot(roll_mean, label='rolling mean')
    pyplot.xticks(rotation=30)
    thinkplot.Config(ylabel='price per gram ($)')

    # right panel: exponentially-weighted moving average
    thinkplot.SubPlot(2)
    thinkplot.Scatter(prices, s=15, alpha=0.1, label=name)
    # pandas.ewma() no longer exists; Series.ewm(...).mean() is the
    # equivalent call.
    ewma = prices.ewm(span=30).mean()
    thinkplot.Plot(ewma, label='EWMA')
    pyplot.xticks(rotation=30)

    thinkplot.Save(root='timeseries10', formats=FORMATS)
def ScatterPlot(heights, weights, alpha=1.0):
    """Draw a scatter plot of weights versus heights and configure the axes.

    The figure is configured but neither shown nor saved here.

    heights: sequence of float
    weights: sequence of float
    alpha: float opacity passed to thinkplot.Scatter
    """
    axis_options = dict(xlabel='height (cm)',
                        ylabel='weight (kg)',
                        axis=[140, 210, 20, 200],
                        legend=False)

    thinkplot.Scatter(heights, weights, alpha=alpha)
    thinkplot.Config(**axis_options)
def PlotFilled(daily, name):
    """Show the filled price series together with its EWMA.

    daily: DataFrame of daily prices
    name: string label for the scattered prices
    """
    filled = FillMissing(daily, span=30)

    thinkplot.Scatter(filled.ppg, s=15, alpha=0.2, label=name)
    thinkplot.Plot(filled.ewma, label='EWMA', color='#ff7f00')
    plt.xticks(rotation=30)

    # The axis title belongs under `ylabel`; the original passed it as
    # `label`, which is presumably a typo (sibling plots use `ylabel`).
    thinkplot.Config(ylabel='Price per gram ($)')
    thinkplot.Show()
def ScatterPlot(ages, weights, alpha=1.0):
    """Draw a scatter plot of weights versus ages and configure the axes.

    The figure is configured but neither shown nor saved here.

    ages: sequence of float
    weights: sequence of float
    alpha: float opacity passed to thinkplot.Scatter
    """
    axis_options = dict(xlabel='age (years)',
                        ylabel='weight (lbs)',
                        xlim=[10, 45],
                        ylim=[0, 15],
                        legend=False)

    thinkplot.Scatter(ages, weights, alpha=alpha)
    thinkplot.Config(**axis_options)
def ScatterPlot(root, heights, weights, alpha=1.0):
    """Draw a scatter plot of weights versus heights and save it.

    root: string filename root passed to thinkplot.Save
    heights: sequence of float
    weights: sequence of float
    alpha: float opacity passed to thinkplot.Scatter
    """
    save_options = dict(xlabel='Height (cm)',
                        ylabel='Weight (kg)',
                        axis=[140, 210, 20, 200],
                        legend=False)

    thinkplot.Scatter(heights, weights, alpha=alpha)
    thinkplot.Save(root=root, **save_options)
def PlotRollingMean(daily, name):
    """Show daily prices with their 30-row rolling mean.

    daily: DataFrame of daily prices
    name: string label for the scattered prices
    """
    # Reindex onto every calendar day between the first and last
    # observation; days without data become NaN rows.
    first_day = daily.index.min()
    last_day = daily.index.max()
    every_day = daily.reindex(pd.date_range(first_day, last_day))

    thinkplot.Scatter(every_day.ppg, s=15, alpha=0.2, label=name)

    smoothed = every_day.ppg.rolling(30).mean()
    thinkplot.Plot(smoothed, label='rolling mean', color='#ff7f00')

    plt.xticks(rotation=30)
    thinkplot.Config(ylabel='price per gram ($)')
    thinkplot.Show()
def sim_pearson(perfs, p1, p2, show=False):
    """Return the Pearson correlation of two people's ratings.

    Only items rated by both people are used:

        r = cov(X, Y) / (sigma_X * sigma_Y)

    with the covariance and variances computed as population moments
    (divided by n).

    perfs: dict mapping person -> dict mapping item -> numeric rating
    p1: key of the first person in perfs
    p2: key of the second person in perfs
    show: if True, also print r next to correlation.Corr's value and
          display a scatter plot of the shared ratings (debugging aid)

    returns: the correlation coefficient, or 0 when the two people share
             no items or either set of shared ratings has zero variance
    """
    # items rated by both p1 and p2, in p1's iteration order
    shared_items = [item for item in perfs[p1] if item in perfs[p2]]
    n = len(shared_items)
    if n == 0:
        return 0

    # ratings of the shared items, aligned by position
    data_p1 = [perfs[p1][item] for item in shared_items]
    data_p2 = [perfs[p2][item] for item in shared_items]

    # mean rating of each person
    mu_p1 = sum(data_p1) / n
    mu_p2 = sum(data_p2) / n

    # population variance of each person's ratings
    var_p1 = sum((x - mu_p1) ** 2 for x in data_p1) / n
    var_p2 = sum((y - mu_p2) ** 2 for y in data_p2) / n

    # a constant series has no defined correlation; report 0
    if var_p1 == 0 or var_p2 == 0:
        return 0

    # covariance
    cov = sum((x - mu_p1) * (y - mu_p2)
              for x, y in zip(data_p1, data_p2)) / n

    # Pearson correlation coefficient
    r = cov / sqrt(var_p1 * var_p2)

    # optional cross-check against the thinkstats implementation
    if show:
        rr = correlation.Corr(data_p1, data_p2)
        print(r, rr)
        thinkplot.Clf()
        thinkplot.Scatter(data_p1, data_p2)
        thinkplot.Show()

    return r
def main(name, data_dir='.'):
    """Save the age/birth-weight scatter plot and print correlation stats.

    Prints the Pearson and Spearman correlations, ten draws from
    SimulateNull, and the result of PValue with 1000 as its third argument.

    name: unused in this function
    data_dir: string directory passed to ReadData
    """
    xs, ys = ReadData(data_dir)

    thinkplot.Scatter(xs, ys, alpha=0.05)
    thinkplot.Save(root='correlate1',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   axis=[9, 45, 0, 250])

    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3; the original print statements only parse on Python 2.
    print('Pearson {}'.format(thinkstats2.Corr(xs, ys)))
    print('Spearman {}'.format(thinkstats2.SpearmanCorr(xs, ys)))

    # SimulateNull is handed fresh list copies on every call
    for _ in range(10):
        print(SimulateNull(list(xs), list(ys)))

    print(PValue(xs, ys, 1000))
def scatter(x):
    """Show total crimes against column `x` and print their statistics.

    Reads the DataFrame `df` from the enclosing (module) scope.

    x: string column name in df; 'month' gets time-axis labels, any other
       name is treated as a crime category
    """
    tot_crimes = df.Total_crimes
    column = df[x]

    if x == 'month':
        title = "Total Crimes vs Time"
        xlabel = "Year"
    else:
        title = "Total Crimes vs " + x + " Crimes"
        xlabel = x + " Crimes"

    thinkplot.Scatter(column, tot_crimes, alpha=.5)
    thinkplot.Show(title=title, xlabel=xlabel, ylabel="Total Crimes")

    print(x + " crime stats")
    print("Spearman's correlation:",
          thinkstats2.SpearmanCorr(tot_crimes, column))
    print("Covariance:", thinkstats2.Cov(tot_crimes, column))
    print()
def PlotDailies(dailies):
    """Save a stacked figure of daily prices, one panel per quality.

    dailies: map from name to DataFrame of daily prices
    """
    thinkplot.PrePlot(rows=3)

    for panel, (name, daily) in enumerate(dailies.items(), start=1):
        thinkplot.SubPlot(panel)

        # only the first panel carries the title
        if panel == 1:
            title = 'price per gram ($)'
        else:
            title = ''
        thinkplot.Config(ylim=[0, 20], title=title)

        thinkplot.Scatter(daily.ppg, s=10, label=name)

        # x tick labels are hidden except on the third panel, where they
        # are rotated instead
        if panel != 3:
            thinkplot.Config(xticks=[])
        else:
            pyplot.xticks(rotation=30)

    thinkplot.Save(root='timeseries1', formats=FORMATS)
def PlotFit(live):
    """Save a scatter plot of birth weight versus age with a fitted line.

    live: DataFrame with `agepreg` and `totalwgt_lb` columns
    """
    ages = live.agepreg
    weights = live.totalwgt_lb

    intercept, gradient = thinkstats2.LeastSquares(ages, weights)
    line_xs, line_ys = thinkstats2.FitLine(ages, intercept, gradient)

    thinkplot.Scatter(ages, weights, color='gray', alpha=0.1)
    # wide white stroke underneath, narrower blue stroke on top
    for stroke_color, stroke_width in (('white', 3), ('blue', 2)):
        thinkplot.Plot(line_xs, line_ys,
                       color=stroke_color, linewidth=stroke_width)

    thinkplot.Save(root='linear1',
                   xlabel='age (years)',
                   ylabel='birth weight (lbs)',
                   axis=[10, 45, 0, 15],
                   legend=False)
def MakeFigures(pool, firsts, others):
    """Save the age-model figures: four CDF plots and one scatter plot.

    pool: object with `age_cdf` and `weight_cdf` attributes; also passed
          to GetAgeWeight for the scatter plot
    firsts: object with `age_cdf` and `weight_cdf` attributes
    others: object with `age_cdf` and `weight_cdf` attributes
    """
    age_title = "Distribution of mother's age"
    weight_title = "Distribution of birth weight"

    # CDF of all ages
    thinkplot.Clf()
    thinkplot.Cdf(pool.age_cdf)
    thinkplot.Save(root='agemodel_age_cdf',
                   title=age_title,
                   xlabel='age (years)',
                   ylabel='CDF',
                   legend=False)

    # CDF of all weights
    thinkplot.Clf()
    thinkplot.Cdf(pool.weight_cdf)
    thinkplot.Save(root='agemodel_weight_cdf',
                   title=weight_title,
                   xlabel='birth weight (oz)',
                   ylabel='CDF',
                   legend=False)

    # CDFs of ages for first babies and others
    thinkplot.Clf()
    thinkplot.Cdfs([firsts.age_cdf, others.age_cdf])
    thinkplot.Save(root='agemodel_age_cdfs',
                   title=age_title,
                   xlabel='age (years)',
                   ylabel='CDF')

    # CDFs of birth weights for first babies and others
    thinkplot.Clf()
    thinkplot.Cdfs([firsts.weight_cdf, others.weight_cdf])
    thinkplot.Save(root='agemodel_weight_cdfs',
                   title=weight_title,
                   xlabel='birth weight (oz)',
                   ylabel='CDF')

    # Scatter plot of ages and weights. Clf() is spelled as in the four
    # calls above; the original used lowercase clf() here only.
    ages, weights = GetAgeWeight(pool)
    thinkplot.Clf()
    thinkplot.Scatter(ages, weights, alpha=0.2)
    thinkplot.Save(root='agemodel_scatter',
                   xlabel='Age (years)',
                   ylabel='Birth weight (oz)',
                   legend=False)
def main():
    """Generate 1000 correlated pairs (rho = -0.8), print stats, show plot.

    The x values go through the linear map a*x + b with a = 1.0 and
    b = 0.0, which leaves them unchanged.
    """
    random.seed(17)

    rho = -0.8
    res = CorrelatedGenerator(1000, rho)
    xs, ys = zip(*res)

    # linear transform of x (identity with these constants)
    a = 1.0
    b = 0.0
    xs = [a * x + b for x in xs]

    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3; the original print statements only parse on Python 2.
    print('mean, var of x {}'.format(thinkstats2.MeanVar(xs)))
    print('mean, var of y {}'.format(thinkstats2.MeanVar(ys)))
    print('covariance {}'.format(thinkstats2.Cov(xs, ys)))
    print('Pearson corr {}'.format(thinkstats2.Corr(xs, ys)))
    print('Spearman corr {}'.format(thinkstats2.SpearmanCorr(xs, ys)))

    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def main():
    """Generate 1000 SAT/IQ pairs (rho = 0.8), fit a line, print and plot.

    Prints the mean and variance of each variable, their Pearson
    correlation, the least-squares intercept and slope, and the coefficient
    of determination, then shows the data with the fitted line.
    """
    random.seed(17)

    rho = 0.8
    xs, ys = SatIqData(1000, rho)

    # Single-argument print(...) produces the same text under Python 2 and
    # Python 3; the original print statements only parse on Python 2.
    print('mean, var of x {}'.format(thinkstats2.MeanVar(xs)))
    print('mean, var of y {}'.format(thinkstats2.MeanVar(ys)))
    print('Pearson corr {}'.format(thinkstats2.Corr(xs, ys)))

    inter, slope = thinkstats2.LeastSquares(xs, ys)
    print('inter {}'.format(inter))
    print('slope {}'.format(slope))

    fxs, fys = thinkstats2.FitLine(xs, inter, slope)
    res = thinkstats2.Residuals(xs, ys, inter, slope)
    R2 = thinkstats2.CoefDetermination(ys, res)
    print('R2 {}'.format(R2))

    thinkplot.Plot(fxs, fys, color='gray', alpha=0.2)
    thinkplot.Scatter(xs, ys)
    thinkplot.Show()
def PlotQuadraticModel(daily, name):
    """Fit the quadratic model to daily prices and save three figures.

    Saves, in order: fitted values ('Output_Timeseries1'), residual
    percentiles ('Output_Timeseries2') and predictions for years 0 through
    10 ('Output_Timeseries3').

    daily: DataFrame of daily prices; its `years` and `ppg` columns are plotted
    name: string label for the scattered data
    """
    price_label = 'price per gram ($)'

    model, results = RunQuadraticModel(daily)
    regression.SummarizeResults(results)

    # figure 1: observed data with the fitted curve
    timeseries.PlotFittedValues(model, results, label=name)
    thinkplot.Save(root='Output_Timeseries1',
                   title='Fitted Val',
                   xlabel='yr',
                   xlim=[-0.2, 4],
                   ylabel=price_label)

    # figure 2: residual percentiles
    timeseries.PlotResidualPercentiles(model, results)
    thinkplot.Save(root='Output_Timeseries2',
                   title='Residual',
                   xlabel='yr',
                   ylabel=price_label)

    # figure 3: predictions drawn over the raw data
    years = np.linspace(0, 10, 200)
    thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
    timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)
    thinkplot.Save(root='Output_Timeseries3',
                   title='Predict',
                   xlabel='yr',
                   xlim=[years[0] - 0.1, years[-1] + 0.1],
                   ylabel=price_label)
#%%
# Plot the observed data with the model's fitted values.
timeseries.PlotFittedValues(model, results, label=name)
thinkplot.Config(
    title='Fitted Values',
    xlabel='years',
    xlim=[-0.1, 3.8],
    ylabel='price ($)/gram',
)

#%%
# Plot predictions on an evenly spaced grid of 101 points from year 0 to 5,
# drawn over the raw data.
years = np.linspace(0, 5, 101)
thinkplot.Scatter(daily.years, daily.ppg, alpha=0.1, label=name)
timeseries.PlotPredictions(daily, years, func=RunQuadraticModel)
thinkplot.Config(
    title='predictions',
    xlabel='years',
    xlim=[years[0] - 0.1, years[-1] + 0.1],
    ylabel='price ($)/gram',
)

#%% [markdown]
# ### Exercise 12.2
# Write a definition for a class named `SerialCorrelationTest` that extends `HypothesisTest` from Section 9.2. It should take a series and a lag as data, compute the serial correlation of the series with the given lag, and then compute the p-value of the observed correlation.
#
# Use this class to test whether the serial correlation in raw price data is statistically significant. Also test the residuals of the linear model and (if you did the previous exercise), the quadratic model.

#%%
# CDF of age
cdf = thinkstats2.Cdf(df.Age)
thinkplot.Cdf(cdf)
thinkplot.Config(xlabel='Age', ylabel='CDF')

# Normal model: a line with intercept = mean and slope = std, evaluated
# over [-4, 4], followed by the normal-probability values of the data.
mean = df.Age.mean()
std = df.Age.std()
xs = [-4, 4]
fxs, fys = thinkstats2.FitLine(xs, inter=mean, slope=std)
thinkplot.Plot(fxs, fys, color='gray', label='model')

xs, ys = thinkstats2.NormalProbability(df.Age)
thinkplot.Plot(xs, ys, label='Age')

# Scatter plots and correlation
# -- year vs. age (year is jittered before plotting)
year = thinkstats2.Jitter(df.Year, .25)
thinkplot.Scatter(year, df.Age)
thinkplot.Show(xlabel='Year', ylabel='Age')
# NOTE(review): the result is not stored or printed; presumably it was
# displayed as notebook cell output -- confirm.
thinkstats2.Corr(df.Year, df.Age)

# -- drug vs. age
thinkplot.Scatter(df.Age, df.Drug)
thinkplot.Show(xlabel='Age', ylabel='Drug')

# Permutation test for a difference in mean age between genders
data = male.Age.values, female.Age.values
ht = DiffMeansPermute(data)
pvalue = ht.PValue()
print(pvalue)
ht.PlotCdf()
thinkplot.Config(xlabel='test statistic', ylabel='CDF')
# imports from __future__ import print_function, division %matplotlib inline import numpy as np import thinkstats2 import thinkplot # get data import first live, firsts, others = first.MakeFrames() live = live.dropna(subset=['agepreg', 'totalwgt_lb']) ages = live.agepreg weights = live.totalwgt_lb # make a scatter plot of birth weight versus mother’s age thinkplot.Scatter(ages, weights, alpha=1, s=10) thinkplot.Config(xlabel='Age (years)', ylabel='Weight (lbs)', xlim=[10, 45], ylim=[0, 15], legend=False) # RESULTS: messy plot # Plot percentiles of birth weight versus mother’s age bins = np.arange(10, 45, 5) indices = np.digitize(live.agepreg, bins) groups = live.groupby(indices) # binned mother's age mean_ages = [group.agepreg.mean() for i, group in groups] cdfs_wgt= [thinkstats2.Cdf(group.totalwgt_lb) for i, group in groups]
meanx = np.mean(xs) if meany is None: meany = np.mean(ys) cov = np.dot(xs-meanx, ys-meany) / len(xs) return cov # In[65]: def Jitter(values, jitter=0.5): # the Jitter is plotted using the Jitter module and scatter n = len(values) return np.random.normal(0, jitter, n) + values heights = Jitter(wins, 1.4) weights = Jitter(runs, 0.5) thinkplot.Scatter(heights, weights, alpha=0.9, s=40,color='blue') thinkplot.Config(xlabel='Wins', ylabel='Runs', axis=[0, 210, 20, 200], legend=False) # In[66]: # In the Fifth part # the covariance is needed in order to compute the correlation Cov(wins_sample, runs_sample) # In[67]: