def plot_df_cdfs(df, fpath, return_plot=False):
    """Plot the CDF of every entry of ``df`` into one PDF page.

    A ``PdfPages`` object is opened at ``fpath + '.pdf'`` and one CDF curve
    (computed by ``shelp.get_cdf``) is drawn per name in ``df.names``. The
    x-axis is switched to log scale as soon as the standard deviation of any
    CDF's index values exceeds ``co.LOGX_LIM``.

    Args:
        df: container indexable by the names listed in ``df.names``.
            NOTE(review): a plain pandas DataFrame exposes ``columns`` rather
            than ``names`` -- confirm the expected type with the callers.
        fpath: output path without the ``.pdf`` extension.
        return_plot: if true, hand the still-open ``PdfPages`` object back to
            the caller instead of saving and closing it here.

    Returns:
        The open ``PdfPages`` object when ``return_plot`` is true, otherwise
        ``None`` (the page has been saved and everything closed).
    """
    pdf = PdfPages(fpath + '.pdf')
    axes = plt.subplot(111)

    use_log_x = False
    for name in df.names:
        cdf = shelp.get_cdf(df[name])
        cdf.plot(label=name)
        # A widely spread support is unreadable on a linear axis.
        spread = cdf.index.values.std()
        if spread > co.LOGX_LIM:
            use_log_x = True

    if use_log_x:
        axes.semilogx()
    axes.set_ylabel(printing.get_ylabel('cdf'))
    axes.legend(loc='best')

    if return_plot:
        return pdf

    pdf.savefig()
    pdf.close()
    plt.close()
def model(self, pred, formula=None, lag=0, reg=co.REGR_OLS, steps=30):
    """Regress the wrapped series on ``pred`` and write out the diagnostics.

    Depending on the type of ``pred`` a ``SimpleModel`` (Series) or a
    ``MultipleModel`` (DataFrame) is fitted. The regression summary, the
    statistics of values drawn from the fitted CDF and the model errors are
    written through ``writing.ResultsWriter``; CDF comparison, shift-function
    and residual (P)ACF plots are produced through ``plotting``. As a side
    effect ``self.md.fitted_cdf`` and ``self.md.formula`` are set.

    Args:
        pred: predictor(s), a ``pandas.Series`` or ``pandas.DataFrame``.
        formula: forwarded to the model constructor; the formula actually
            used is the one returned by the model's ``model()`` call.
        lag: forwarded to the model and the results writer, and embedded in
            the output file name.
        reg: regression type identifier, handled like ``lag``.
        steps: handled like ``lag``.

    Returns:
        ``None`` if the series' ``STD`` is NaN or zero (modeling is skipped
        with a warning on stdout), otherwise the tuple
        ``(res, formula, errs)``.

    Raises:
        TypeError: if ``pred`` is neither a Series nor a DataFrame.
        NotImplementedError: if the series is listed in ``printing.logxs``
            and ``pred`` is not a Series. Note that this is raised only
            after the model was fitted and its summary written.
    """
    # Nothing sensible can be fitted to an empty or constant series.
    if np.isnan(self.md.STD):
        print('WARNING: modeling %s skipped. Appears empty.'
              % (self.md.series.name,))
        return None
    if self.md.STD == 0.:
        print('WARNING: modeling %s skipped. Appears constant.'
              % (self.md.series.name,))
        return None

    # isinstance() rather than an exact type comparison so that subclasses
    # of the pandas containers are accepted as well.
    pred_is_series = isinstance(pred, pan.Series)
    if isinstance(pred, pan.DataFrame):
        model_cls = MultipleModel
    elif pred_is_series:
        model_cls = SimpleModel
    else:
        raise TypeError(
            '"pred" must be pandas.Series or pandas.DataFrame')
    fitter = model_cls(self.md.series, pred, self.tpath, formula,
                       self.md.test, lag, reg, steps)
    sys.stdout.flush()

    res, formula = fitter.model()
    res_writer = writing.ResultsWriter(
        self.tpath, formula, self.md.test, res, lag, reg, steps)
    res_writer.write_regr_summ()

    if self.md.series.name in printing.logxs:
        # If the LHS was logarithmically transformed for the fitting we need
        # to compute the fitted CDF manually. Just taking res.fittedvalues
        # leads to high deviation between the original CDF and the fit.
        if not pred_is_series:
            raise NotImplementedError(
                'manually computing fitted CDF if predictor is a '
                'pandas.DataFrame is not implemented.')
        coef = res.params[printing.cond_wrap_term_log10(pred.name)]
        fit_cdf = (shelp.get_inv_cdf(shelp.get_cdf(pred)) * coef
                   + res.params['Intercept'])
        fit_cdf = shelp.get_inv_cdf(fit_cdf)
    else:
        # The fitted values are sometimes out of order in the upper
        # quantiles if multiple regression is applied, so they are sorted.
        # NOTE(review): this sorts the array behind res.fittedvalues in
        # place, so the ``res`` returned to the caller is affected too.
        res.fittedvalues.values.sort()
        fit_cdf = shelp.get_inv_cdf(res.fittedvalues)

    fit_cdf.name = self.md.series.name + ' (estimated)'
    self.md.fitted_cdf = fit_cdf
    self.md.formula = formula

    # File name: all run parameters joined, with blanks turned into '_'.
    fname = ' '.join([
        formula,
        self.md.source,
        reg,
        str(steps),
        'lag',
        str(lag),
    ]).replace(' ', '_')
    fpath = osp.join(self.tpath, fname)
    plotting.plot_multi_cdf(
        [self.md.cdf, self.md.fitted_cdf],
        [self.md.series.name, self.md.fitted_cdf.name],
        fpath)

    if pred_is_series:
        # Additionally compare the observed CDF with the predictor's CDF.
        # NOTE(review): fpath is rebound here, so for a Series predictor the
        # shift-function and residual plots below are written under the
        # 'vs' name -- confirm this is intended.
        fpath = osp.join(self.tpath, fname.replace('~', 'vs'))
        plotting.plot_multi_cdf(
            [self.md.cdf, shelp.get_cdf(pred)],
            [self.md.series.name, pred.name],
            fpath)

    fitted_rvs = shelp.rvs_from_cdf(self.md.fitted_cdf)
    res_writer.write_stats(fitted_rvs.describe(percentiles=[.05, .5, .95]))

    shift_xs, shift_ys = shelp.emp_shift_os(self.md.series, fitted_rvs)
    w_band_u, w_band_l = shelp.w_band(self.md.series, fitted_rvs)
    plotting.plot_shiftfun(shift_xs, shift_ys, w_band_u, w_band_l, fpath)

    errs = shelp.model_errors_by_shiftfun(w_band_u, w_band_l)
    res_writer.write_model_errors(errs)

    plotting.plot_resid_acf(res.resid, fpath)
    plotting.plot_resid_pacf(res.resid, fpath)
    return (res, formula, errs)
def __init__(self, md, tpath):
    """Keep the model data and target path and precompute the series' CDF.

    Args:
        md: model data object; must expose ``series`` and accept a ``cdf``
            attribute, which is set here from ``shelp.get_cdf``.
        tpath: stored as ``_tpath``.
    """
    self._md = md
    # Goes through ``self.md`` rather than the ``md`` argument -- presumably
    # an accessor over ``_md`` defined elsewhere in the class.
    observed = self.md.series
    self.md.cdf = shelp.get_cdf(observed)
    self._tpath = tpath