def SimulateAutocorrelation(daily, iters=1001, nlags=40): """Resample residuals, compute autocorrelation, and plot percentiles. daily: iters: nlags: """ # run simulations t = [] for i in range(iters): filled = FillMissing(daily, span=30) resid = thinkstats2.Resample(filled.resid) acf = smtsa.acf(resid, nlags=nlags, unbiased=True)[1:] t.append(np.abs(acf)) # put the results in an array and sort the columns size = iters, len(acf) array = np.zeros(size) for i, acf in enumerate(t): array[i, ] = acf array = np.sort(array, axis=0) # find the bounds that cover 95% of the distribution high = PercentileRow(array, 97.5) low = -high lags = range(1, nlags + 1) thinkplot.FillBetween(lags, low, high, alpha=0.2, color='gray')
def GeneratePredictions(result_seq, years, add_resid=False): """Generates an array of predicted values from a list of model results. When add_resid is False, predictions represent sampling error only. When add_resid is True, they also include residual error (which is more relevant to prediction). result_seq: list of model results years: sequence of times (in years) to make predictions for add_resid: boolean, whether to add in resampled residuals returns: sequence of predictions """ n = len(years) d = dict(Intercept=np.ones(n), years=years, years2=years**2) predict_df = pandas.DataFrame(d) predict_seq = [] for fake_results in result_seq: predict = fake_results.predict(predict_df) if add_resid: predict += thinkstats2.Resample(fake_results.resid, n) predict_seq.append(predict) return predict_seq
def GeneratePredictions(result_seq, years, add_resid=False): """Generates an array of predicted values from a list of model results. When add_resid is False, predictions represent sampling error only. When add_resid is True, they also include residual error (which is more relevant to prediction). result_seq: list of model results years: sequence of times (in years) to make predictions for add_resid: boolean, whether to add in resampled residuals returns: array with one row per model result, one column per timestep """ n = len(years) d = dict(Intercept=np.ones(n), years=years, years2=years**2) predict_df = pandas.DataFrame(d) size = len(result_seq), n array = np.zeros(size) for i, fake_results in enumerate(result_seq): predict = fake_results.predict(predict_df) if add_resid: predict += thinkstats2.Resample(fake_results.resid, n) array[i, ] = predict array = np.sort(array, axis=0) return array
def FalseNegRate(data, num_runs=1000): """Computes the chance of a false negative based on resampling. data: pair of sequences num_runs: how many experiments to simulate returns: float false negative rate """ group1, group2 = data count = 0 for i in range(num_runs): sample1 = thinkstats2.Resample(group1) sample2 = thinkstats2.Resample(group2) ht = DiffMeansPermute((sample1, sample2)) p_value = ht.PValue(iters=101) if p_value > 0.05: count += 1 return count / num_runs
def SimulateAutocorrelation(daily, iters=1001, nlags=40): """Resample residuals, compute autocorrelation, and plot percentiles daily: DataFrame iters: number of simulations to run nlags: maximum lags to compute autocorrelation """ t = [] for _ in range(iters): filled = FillMissing(daily, span=30) resid = thinkstats2.Resample(filled.resid) acf = smtsa.acf(resid, nlags=nlags, unbiased=True)[1:] t.append(np.abs(acf)) high = thinkstats2.PercentileRows(t, [97.5])[0] low = -high lags = range(1, nlags + 1) thinkplot.FillBetween(lags, low, high, alpha=0.2, color='gray')
def SimulateResults(daily, iters=101, func=RunLinearModel): """Run simulations based on resampling residuals. daily: DataFrame of daily prices iters: number of simulations func: function that fits a model to the data returns: list of result objects """ _, results = func(daily) fake = daily.copy() result_seq = [] for _ in range(iters): fake.ppg = results.fittedvalues + thinkstats2.Resample(results.resid) _, fake_results = func(fake) result_seq.append(fake_results) return result_seq
def FillMissing(daily, span=30): """Fills missing values with an exponentially weighted moving average. Resulting DataFrame has new columns 'ewma' and 'resid'. daily: DataFrame of daily prices span: window size (sort of) passed to ewma returns: new DataFrame of daily prices """ dates = pandas.date_range(daily.index.min(), daily.index.max()) reindexed = daily.reindex(dates) ewma = pandas.ewma(reindexed.ppg, span=span) resid = (reindexed.ppg - ewma).dropna() fake_data = ewma + thinkstats2.Resample(resid, len(reindexed)) reindexed.ppg.fillna(fake_data, inplace=True) reindexed['ewma'] = ewma reindexed['resid'] = reindexed.ppg - ewma return reindexed
def SimulateIntervals(daily, iters=101, func=RunLinearModel): """Run simulations based on different subsets of the data. daily: DataFrame of daily prices iters: number of simulations func: function that fits a model to the data returns: list of result objects """ result_seq = [] starts = np.linspace(0, len(daily), iters).astype(int) for start in starts[:-2]: subset = daily[start:] _, results = func(subset) fake = subset.copy() for _ in range(iters): fake.ppg = (results.fittedvalues + thinkstats2.Resample(results.resid)) _, fake_results = func(fake) result_seq.append(fake_results) return result_seq