Example #1
0
def plot_df_cdfs(df, fpath, return_plot=False):
    """Plot the CDF of every column of ``df`` onto one shared axes.

    A ``PdfPages`` file is opened at ``fpath + '.pdf'``. For each column
    label the CDF is obtained via ``shelp.get_cdf`` and plotted with the
    label as legend entry. If the standard deviation of any CDF's index
    values exceeds ``co.LOGX_LIM``, the x-axis is switched to log scale.

    Args:
        df: container whose column labels are read from ``df.names`` and
            whose columns are accessed as ``df[col]``.
        fpath: output path without the ``.pdf`` extension.
        return_plot: if true, return the still-open ``PdfPages`` object
            without saving the figure; the caller is then responsible for
            saving and closing it.

    Returns:
        The open ``PdfPages`` object if ``return_plot`` is true, otherwise
        ``None`` (the figure is saved, and both the PDF and the pyplot
        figure are closed).

    Raises:
        Any exception raised while plotting or saving is re-raised after
        the PDF file and the current pyplot figure have been closed.
    """
    pdf = PdfPages(fpath + '.pdf')
    try:
        ax = plt.subplot(111)
        logx = False
        # NOTE(review): a plain pandas.DataFrame exposes its column labels as
        # ``.columns``, not ``.names`` -- confirm the type of ``df`` passed in.
        for col in df.names:
            col_cdf = shelp.get_cdf(df[col])
            col_cdf.plot(label=col)
            # One widely spread column is enough to switch the shared x-axis
            # to a logarithmic scale.
            if col_cdf.index.values.std() > co.LOGX_LIM:
                logx = True
        if logx:
            ax.semilogx()
        ax.set_ylabel(printing.get_ylabel('cdf'))
        ax.legend(loc='best')

        if return_plot:
            # Hand the open PDF over to the caller; nothing is saved here.
            return pdf
        pdf.savefig()
    except Exception:
        # Do not leak the open PDF file handle or the pyplot figure when
        # plotting or saving fails half-way.
        pdf.close()
        plt.close()
        raise
    pdf.close()
    plt.close()
Example #2
0
    def model(self, pred, formula=None, lag=0, reg=co.REGR_OLS, steps=30):
        """Fit ``self.md.series`` against ``pred`` and write results and plots.

        A ``SimpleModel`` is built when ``pred`` is a ``pandas.Series`` and a
        ``MultipleModel`` when it is a ``pandas.DataFrame``. After fitting,
        the regression summary, the statistics of values drawn from the
        fitted CDF and the shift-function based model errors are written via
        ``writing.ResultsWriter``. CDF, shift-function and residual ACF/PACF
        plots are produced under ``self.tpath``.

        Side effects: sets ``self.md.fitted_cdf`` and ``self.md.formula``.
        When the series name is not in ``printing.logxs``, the array behind
        ``res.fittedvalues`` is sorted in place.

        Args:
            pred: predictor, a ``pandas.Series`` or ``pandas.DataFrame``.
            formula: forwarded to the model constructor; the formula returned
                by ``model.model()`` is used from then on.
            lag: forwarded to the model and the results writer; also part of
                the output file name.
            reg: forwarded to the model and the results writer; also part of
                the output file name (so it must be a string).
            steps: forwarded to the model and the results writer; also part
                of the output file name.

        Returns:
            ``(res, formula, errs)`` on success, or ``None`` if
            ``self.md.STD`` is NaN or zero (a warning is printed).

        Raises:
            TypeError: if ``pred`` is neither a Series nor a DataFrame.
            NotImplementedError: if the series name is in ``printing.logxs``
                and ``pred`` is a DataFrame.
        """
        # A series without spread cannot be modeled: warn and bail out.
        if np.isnan(self.md.STD):
            print('WARNING: modeling %s skipped. Appears empty.'
                  % (self.md.series.name,))
            return None
        if self.md.STD == 0.:
            print('WARNING: modeling %s skipped. Appears constant.'
                  % (self.md.series.name,))
            return None

        # Both model classes take the same constructor arguments; only the
        # class depends on the predictor type.
        if isinstance(pred, pan.DataFrame):
            model_cls = MultipleModel
        elif isinstance(pred, pan.Series):
            model_cls = SimpleModel
        else:
            raise TypeError(
                '"pred" must be pandas.Series or pandas.DataFrame')
        single_pred = model_cls is SimpleModel

        model = model_cls(
                self.md.series,
                pred,
                self.tpath,
                formula,
                self.md.test,
                lag,
                reg,
                steps)
        sys.stdout.flush()
        res, formula = model.model()

        res_writer = writing.ResultsWriter(
                self.tpath,
                formula,
                self.md.test,
                res,
                lag,
                reg,
                steps)
        res_writer.write_regr_summ()

        if self.md.series.name in printing.logxs:
            # If the LHS was logarithmically transformed for the fitting we
            # need to compute the fitted CDF manually. Just taking
            # res.fittedvalues leads to high deviation between the original
            # CDF and the fit.
            if single_pred:
                coef = res.params[printing.cond_wrap_term_log10(pred.name)]
                fit_cdf = (shelp.get_inv_cdf(shelp.get_cdf(pred)) * coef
                           + res.params['Intercept'])
                fit_cdf = shelp.get_inv_cdf(fit_cdf)
            else:
                raise NotImplementedError(
                    'manually computing fitted CDF if predictor is a '
                    'pandas.DataFrame is not implemented.')
        else:
            # The fitted values are sometimes out of order in the upper
            # quantiles if multiple regression is applied, so sort them.
            # This sorts the underlying array of res.fittedvalues in place.
            res.fittedvalues.values.sort()
            fit_cdf = shelp.get_inv_cdf(res.fittedvalues)

        fit_cdf.name = self.md.series.name + ' (estimated)'
        self.md.fitted_cdf = fit_cdf
        self.md.formula = formula

        # Spaces inside the formula are replaced too, so the file name
        # contains no blanks at all.
        fname = ' '.join([
            formula,
            self.md.source,
            reg,
            str(steps),
            'lag',
            str(lag),
            ]).replace(' ', '_')
        fpath = osp.join(self.tpath, fname)
        plotting.plot_multi_cdf(
                [self.md.cdf, self.md.fitted_cdf],
                [self.md.series.name, self.md.fitted_cdf.name],
                fpath)

        if single_pred:
            # From here on fpath points at the "<lhs>vs<rhs>" variant, which
            # the shift-function and residual plots below use as well.
            fpath = osp.join(self.tpath, fname.replace('~', 'vs'))
            plotting.plot_multi_cdf(
                    [self.md.cdf, shelp.get_cdf(pred)],
                    [self.md.series.name, pred.name],
                    fpath)

        fitted_rvs = shelp.rvs_from_cdf(self.md.fitted_cdf)

        res_writer.write_stats(fitted_rvs.describe(percentiles=[.05, .5, .95]))

        shift_xs, shift_ys = shelp.emp_shift_os(self.md.series, fitted_rvs)
        w_band_u, w_band_l = shelp.w_band(self.md.series, fitted_rvs)
        plotting.plot_shiftfun(shift_xs, shift_ys, w_band_u, w_band_l, fpath)

        errs = shelp.model_errors_by_shiftfun(w_band_u, w_band_l)
        res_writer.write_model_errors(errs)

        plotting.plot_resid_acf(res.resid, fpath)
        plotting.plot_resid_pacf(res.resid, fpath)

        return (res, formula, errs)
Example #3
0
 def __init__(self, md, tpath):
     """Keep the model data and target path and attach the series' CDF.

     ``md`` and ``tpath`` are stored on the private ``_md`` / ``_tpath``
     attributes. The CDF of ``md.series`` (computed by ``shelp.get_cdf``)
     is stored on the model data as ``cdf``.
     """
     self._md = md
     self._tpath = tpath
     # ``self.md`` is presumably a property over ``self._md`` defined
     # elsewhere in the class -- verify against the class body.
     series_cdf = shelp.get_cdf(self.md.series)
     self.md.cdf = series_cdf