Example #1
import pandas as pd
from arch.bootstrap import StationaryBootstrap, CircularBlockBootstrap


def block_bootstrap(series, n_samples, bs_type='Stationary', block_size=10):
    '''
    Computes bootstrapped samples of series.

    Inputs:
        series: pandas Series indexed by time
        n_samples: number of bootstrapped samples to output
        bs_type ('Stationary'): type of bootstrapping to perform.
            Options include ['Stationary', 'Circular']
        block_size: size of the resampling blocks. Should be large enough to
            capture the important frequencies in the series

    Output:
        DataFrame indexed by sample number and time
    '''

    # Set up list for sampled time-series
    list_samples = []

    # Stationary bootstrapping
    if bs_type == 'Stationary':
        bs = StationaryBootstrap(block_size, series)

        # Count for sample number
        count = 1
        for data in bs.bootstrap(n_samples):
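            # `data` is a tuple of (positional args, keyword args) from arch;
            # data[0][0] is the resampled copy of `series` for this replicate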

            df_temp = pd.DataFrame({
                'sample': count,
                'time': series.index.values,
                'x': data[0][0]
            })
            list_samples.append(df_temp)
            count += 1

    if bs_type == 'Circular':
        bs = CircularBlockBootstrap(block_size, series)

        # Count for sample number
        count = 1
        for data in bs.bootstrap(n_samples):

            df_temp = pd.DataFrame({
                'sample': count,
                'time': series.index.values,
                'x': data[0][0]
            })
            list_samples.append(df_temp)
            count += 1

    # Concatenate list of samples
    df_samples = pd.concat(list_samples)
    df_samples.set_index(['sample', 'time'], inplace=True)

    # Output DataFrame of samples
    return df_samples
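A minimal usage sketch for the helper above; the input series is synthetic and purely illustrative, and pandas, numpy, and arch are assumed to be installed:

import numpy as np
import pandas as pd

# synthetic random-walk series indexed by time step
series = pd.Series(np.random.randn(200).cumsum(), index=np.arange(200))

# 50 stationary-bootstrap replicates, returned as a DataFrame indexed by (sample, time)
df_boot = block_bootstrap(series, n_samples=50, bs_type='Stationary', block_size=20)
print(df_boot.head())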
Example #2
from arch.bootstrap import StationaryBootstrap


def stationary_boostrap_method(X, Y, block_size=50, n_samples=50):
    """Return n_samples paired stationary-bootstrap resamples of X and Y (passed as y)."""
    boot_samples = []
    bs = StationaryBootstrap(block_size, X, y=Y)

    for samp in bs.bootstrap(n_samples):
        boot_samples.append((samp[0][0], samp[1]['y']))

    return boot_samples
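A short usage sketch, assuming X and Y are equal-length NumPy arrays (the data below are made up):

import numpy as np

rng = np.random.default_rng(0)
X = rng.normal(size=500)
Y = 0.5 * X + rng.normal(size=500)

# each element of the result is a (resampled X, resampled Y) pair,
# drawn with matching stationary-bootstrap blocks
samples = stationary_boostrap_method(X, Y, block_size=25, n_samples=10)
x_star, y_star = samples[0]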
Example #3
def get_TheilSen(_x, what, _nboot, _y):
    import numpy as np
    import pandas as pd
    #note the argument order: DataFrame.apply appears to pass the column being processed as the last argument (_y)
    from arch.bootstrap import StationaryBootstrap, IIDBootstrap
    from scipy.stats import mstats, mannwhitneyu, t, kendalltau
    from statsmodels.distributions.empirical_distribution import ECDF
    try:
        if what=="slope":
            return mstats.theilslopes(np.ma.masked_invalid(_y.values), _x)[0]*86400*365*1000000000
        elif what=="pval_tau":
            return kendalltau(_x, _y)[1]/2
        elif what=="pval_autocorr":            
            res0=mstats.theilslopes(_y, _x, alpha=0.95)[0]
            bs=StationaryBootstrap(3, np.array(range(len(_y))))
            bs_slopes=[]
            for data in bs.bootstrap(_nboot):
                ind=data[0][0]
                res=mstats.theilslopes(_y[ind], _x, alpha=0.95)
                bs_slopes=bs_slopes+[res[0]]
            ecdf=ECDF(bs_slopes)
            pvalue=ecdf(res0)
            if pvalue>0.5:
                pvalue=1-pvalue
#            print pvalue
            return pvalue
        elif what=="pval":
            bs=IIDBootstrap(np.array(range(len(_y))))
            bs_slopes=[]
            for data in bs.bootstrap(_nboot):
                ind=data[0][0]
                res=mstats.theilslopes(_y[ind], _x, alpha=0.95)
                bs_slopes=bs_slopes+[res[0]]
            ecdf=ECDF(bs_slopes)
            pvalue=ecdf(0)
            if pvalue>0.5:
                pvalue=1-pvalue
#            print pvalue
            return pvalue
    except Exception:
        # the fit occasionally fails on degenerate or resampled data; signal with NaN
        return np.nan
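A hedged usage sketch for get_TheilSen applied column-wise with DataFrame.apply; the data, column names, and the integer nanosecond time axis below are assumptions, not part of the original:

import numpy as np
import pandas as pd

# synthetic monthly series on a datetime index, purely illustrative
idx = pd.date_range("2000-01-01", periods=240, freq="MS")
df = pd.DataFrame({"a": np.random.randn(240).cumsum(),
                   "b": np.random.randn(240).cumsum()}, index=idx)

# x axis as nanoseconds since epoch, matching the per-nanosecond scaling above
x = idx.asi8

slopes = df.apply(lambda col: get_TheilSen(x, "slope", 200, col))
pvals = df.apply(lambda col: get_TheilSen(x, "pval_tau", 200, col))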
Example #4
    def fit(self, df_portfolios, df_factors):
        """ Fit the estimator

        Parameters
        ----------
        df_portfolios : DataFrame
            Time series of portfolios (test assets)

        df_factors : DataFrame or Series
            Time series of the factors
        """
        tsres, _, loadings = _fmb(df_portfolios, df_factors, self.intercept)
        self._tsres = tsres
        self.loadings = loadings

        if self.alpha is None:
            self.alpha = _get_alpha(self._tsres)

        self._xsres = _penfmb(loadings, self.alpha, self.d, self.tol,
                              self.maxiter)
        self._xsres.name = 'coef'

        sbs = StationaryBootstrap(self.block_length, df_portfolios, df_factors)

        bsxsres = []
        for data in sbs.bootstrap(self.nboot):
            tsres, _, bloadings = _fmb(data[0][0].reset_index(drop=True),
                                       data[0][1].reset_index(drop=True),
                                       self.intercept)
            bsxsres.append(
                _penfmb(bloadings, _get_alpha(tsres), self.d, self.tol,
                        self.maxiter))

        bsxsres = pd.DataFrame(bsxsres)
        self._srate = 1.0 * (bsxsres == 0).sum(axis=0) / bsxsres.shape[0]
        self._srate.name = 'shrinkage rate'
        # self._se = bsxsres.std(axis=0)
        # self._se.name = 'standard error'

        return self
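The loop above relies on arch resampling both DataFrames with the same block indices. A minimal standalone sketch of that joint-resampling pattern, independent of the private _fmb/_penfmb helpers and using made-up data:

import numpy as np
import pandas as pd
from arch.bootstrap import StationaryBootstrap

# hypothetical aligned series: 5 test portfolios and 2 factors over 300 periods
rng = np.random.default_rng(1)
df_portfolios = pd.DataFrame(rng.normal(size=(300, 5)))
df_factors = pd.DataFrame(rng.normal(size=(300, 2)))

sbs = StationaryBootstrap(12, df_portfolios, df_factors)
for data in sbs.bootstrap(100):
    # data[0] holds the positional arguments, resampled with identical rows
    boot_ports, boot_facts = data[0][0], data[0][1]
    # ... run the two-pass regression on (boot_ports, boot_facts) here ...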
Example #5
File: core.py  Project: yangxhcaf/ewstools
import pandas as pd
from arch.bootstrap import StationaryBootstrap, CircularBlockBootstrap


def block_bootstrap(series, n_samples, bs_type='Stationary', block_size=10):

    '''
    Computes block-bootstrap samples of series.
    
    Args
    ----
    series: pd.Series
        Time-series data in the form of a Pandas Series indexed by time
    n_samples: int
        Number of bootstrapped samples to output.
    bs_type: {'Stationary', 'Circular'}
        Type of block-bootstrapping to perform.
    block_size: int
        Size of resampling blocks. Should be big enough to
        capture important frequencies in the series.
        
    Returns
    -------
    pd.DataFrame:
        DataFrame containing the block-bootstrapped samples of series. 
        Indexed by sample number, then time.
    
    '''

    # Set up list for sampled time-series
    list_samples = []
    
    # Stationary bootstrapping
    if bs_type == 'Stationary':
        bs = StationaryBootstrap(block_size, series)
                
        # Count for sample number
        count = 1
        for data in bs.bootstrap(n_samples):
            
            df_temp = pd.DataFrame({'sample': count, 
                                    'time': series.index.values,
                                    'x': data[0][0]})
            list_samples.append(df_temp)
            count += 1
            
    if bs_type == 'Circular':
        bs = CircularBlockBootstrap(block_size, series)
                
        # Count for sample number
        count = 1
        for data in bs.bootstrap(n_samples):
            
            df_temp = pd.DataFrame({'sample': count, 
                                    'time': series.index.values,
                                    'x': data[0][0]})
            list_samples.append(df_temp)
            count += 1   
    

    # Concatenate list of samples
    df_samples = pd.concat(list_samples)
    df_samples.set_index(['sample','time'], inplace=True)

    
    # Output DataFrame of samples
    return df_samples
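A usage sketch for this ewstools variant, showing the circular option and how to pull a single replicate out of the MultiIndexed result (the AR(1) series below is synthetic):

import numpy as np
import pandas as pd

# synthetic AR(1) series, purely illustrative
rng = np.random.default_rng(42)
x = np.zeros(500)
for t in range(1, 500):
    x[t] = 0.8 * x[t - 1] + rng.normal()
series = pd.Series(x, index=np.arange(500))

df_samples = block_bootstrap(series, n_samples=100, bs_type='Circular', block_size=40)
# all rows belonging to bootstrap replicate 1, indexed by time
first_sample = df_samples.loc[1]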
Example #6
def trend_CI(x_var, y_var, n_boot=1000, ci=95, trendtype="linreg", q=0.5, frac=0.6, it=3, autocorr=None, CItype="bootstrap"):
    """calculates bootstrap confidence interval and significance level for trend, ignoring autocorrelation or accounting for it
    Parameters
    ----------
    x_var : list
      independent variable
    y_var : list
      dependent variable, same length as x_var
    q : int, optional, only if trendtype==quantreg
      quantile for which regression is to be calculated
    n : int, optional
      number of bootstrap samples
    ci : int, optional
      confidence level. Default is for 95% confidence interval
    frac : int, optional, only if trendtype==lowess
      lowess parameter (fraction of time period length used in local regression)
    it : int, optional, only if trendtype==lowess
      lowess parameter (numbre of iterations)
    autocorr : str, optional
      way of accounting for autocorrelation, possible values: None, "bootstrap"
    trendtype : str, optional
      method of trend derivation, possible values: lowess, linreg, quantreg, TheilSen
    CItype : str, optional
      method of CI derivation, possible values: "analytical" and "bootstrap". 
      if trendtype is "lowess", CItype will be set to None
      if CItype is "analytical": autocorrelation will be set to None
      

    Results
    -------
    returns library with following elements:
    slope - slope of the trend
    CI_high - CI on the slope value
    CI_low - as above
    pvalue - trend's significance level
    trend - trend line, or rather its y values for all x_var
    trendCI_high - confidence interval for each value of y
    trendCI_low - as above

    Remarks
    -------
    the fit function ocassionally crashes on resampled data. The workaround is to use try statement
    """
    import numpy as np
    import pandas as pd
    #for linreg
    import statsmodels.api as sm
    from statsmodels.regression.linear_model import OLS
    #for arima
    import statsmodels.tsa as tsa
    #for quantreg
    import statsmodels.formula.api as smf
    from statsmodels.regression.quantile_regression import QuantReg
    #for lowess
    import statsmodels.nonparametric.api as npsm
    #other
    from statsmodels.distributions.empirical_distribution import ECDF
    from scipy.stats import mstats, mannwhitneyu, t, kendalltau
    from arch.bootstrap import StationaryBootstrap, IIDBootstrap

    #preparing data
    if CItype=="analytical" and trendtype=="TheilSen":
        CItype="bootstrap"
    x_var=np.array(x_var)
    y_var=np.ma.masked_invalid(y_var)
    n_data=len(y_var)
    ci_low=(100-ci)/2
    ci_high=100-ci_low
    
    #setting bootstrapping function
    if autocorr=="bootstrap":
        bs=StationaryBootstrap(3, np.array(range(len(y_var))))
    else:
        bs=IIDBootstrap(np.array(range(len(y_var))))
    
    if trendtype=="quantreg":
        print "Quantile regression, CI type: "+CItype+", autocorrelation adjustment: "+str(autocorr)+"\n"
        xydata=pd.DataFrame(np.column_stack([x_var, y_var]), columns=['X', 'Y'])
        model=smf.quantreg('Y ~ X', xydata)
        res=model.fit(q=q)
        intcpt=res.params.Intercept
        slope=res.params.X
        pvalue=res.pvalues[1]
        CI_low=res.conf_int()[0]['X']
        CI_high=res.conf_int()[1]['X']
        y_pred=res.predict(xydata)
        #calculating residuals
        resids=y_var-y_pred
        #calculate autocorrelation indices
        autocorr_test(x_var, resids)
            
        if CItype=="bootstrap":
            #bootstrapping
            bs_trends=np.copy(y_pred).reshape(-1,1)
            bs_slopes=[]
            bs_intcpts=[]
            for data in bs.bootstrap(n_boot):
                ind=data[0][0]
                model = smf.quantreg('Y ~ X', xydata.iloc[ind,:])
                try:
                    res = model.fit(q=q)
                    bs_slopes=bs_slopes+[res.params.X]
                    bs_intcpts=bs_intcpts+[res.params.Intercept]
                    bs_trends=np.append(bs_trends,res.predict(xydata).reshape(-1,1), 1)
                except Exception:
                    # the fit occasionally fails on resampled data; skip this replicate
                    pass
    if trendtype=="linreg":
        print "Linear regression, CI type: "+CItype+", autocorrelation adjustment: "+str(autocorr)+"\n"
        x_varOLS = sm.add_constant(x_var)
        model = sm.OLS(y_var, x_varOLS, hasconst=True, missing='drop')
        res = model.fit()
        intcpt,slope=res.params
        pvalue=res.pvalues[1]
        CI_low,CI_high=res.conf_int()[1]
        y_pred=res.predict(x_varOLS)
        #calculating residuals
        resids=y_var-y_pred
        #calculate autocorrelation indices
        autocorr_test(x_var, resids)
        
        if CItype=="bootstrap":        
            #bootstrapping for confidence intervals
            bs_slopes=[]
            bs_intcpts=[]
            bs_trends=np.copy(y_pred).reshape(-1,1)
            for data in bs.bootstrap(n_boot):
                ind=data[0][0]
                model = sm.OLS(y_var[ind], x_varOLS[ind,:], hasconst=True, missing='drop')
                try:
                    res = model.fit()
                    bs_slopes=bs_slopes+[res.params[1]]
                    bs_intcpts=bs_intcpts+[res.params[0]]
                    bs_trends=np.append(bs_trends,res.predict(x_varOLS).reshape(-1,1), 1)
                except Exception:
                    # the fit occasionally fails on resampled data; skip this replicate
                    pass
                    
    if trendtype=="TheilSen":
#        print "Theil-Sen slope, CI type: "+CItype+", autocorrelation adjustment: "+str(autocorr)+"\n"
        #significance of MK tau
        tau,pvalue=kendalltau(x_var, y_var)
#        print "raw MK tau:", tau, "raw MK pvalue:", pvalue
        #TS slope and confidence intervals
        slope,intercept,CI_low,CI_high=mstats.theilslopes(y_var, x_var, alpha=0.95)        
        #getting slope line's y values
        y_pred=intercept+slope*x_var
        #calculating residuals
        resids=y_var-y_pred
        #calculate autocorrelation indices
        autocorr_test(x_var, resids)
                    
        if CItype=="bootstrap":
            #bootstrapping for confidence intervals
            bs_slopes=[]
            bs_intcpts=[]
            bs_trends=np.copy(y_pred).reshape(-1,1)
            for data in bs.bootstrap(n_boot):
                ind=data[0][0]
                res=mstats.theilslopes(y_var[ind], x_var[ind], alpha=0.95)
                bs_slopes=bs_slopes+[res[0]]
                bs_intcpts=bs_intcpts+[res[1]]
                bs_trends=np.append(bs_trends, (res[1]+res[0]*x_var).reshape(-1,1), 1)

    if trendtype=="lowess":
        print "Lowess\n"
        temp=dict(npsm.lowess(y_var, x_var, frac=frac, it=it, missing="drop"))
        #map the lowess fit back onto the original x values (list() needed under Python 3)
        y_pred=np.array(list(map(temp.get, x_var))).astype("float").reshape(-1,1)
        bs_trends=np.copy(y_pred)
        
        for data in bs.bootstrap(n_boot):
            ind=data[0][0]
            try:
                temp = dict(npsm.lowess(y_var[ind], x_var[ind], frac=frac, it=it, missing="drop"))
                temp=np.array(list(map(temp.get, x_var))).astype("float").reshape(-1,1)
                pred=pd.DataFrame(temp, index=x_var)
                temp_interp=pred.interpolate().values
                bs_trends=np.append(bs_trends, temp_interp, 1)
            except Exception:
                # lowess occasionally fails on resampled data; skip this replicate
                pass


    #calculating final values of CI and p-value

    #skipping when lowess
    if trendtype=="lowess":
        CI_low=np.nan
        CI_high=np.nan
        slope=np.nan
        intcpt=np.nan
        pvalue=np.nan
        confint=np.nanpercentile(bs_trends, [ci_low,ci_high], 1)
        # np.nanpercentile returns shape (2, n_points): row 0 is the lower band, row 1 the upper
        trendCI_low=confint[0,:]
        trendCI_high=confint[1,:]
    else:
        if CItype=="bootstrap":
            #values for slope, intercept and trend could be taken as medians of the bootstrap
            # distributions, but normally the analytical parameters are used instead.
            #If the bootstrap bias (difference between the analytical values and the bootstrap
            # median) is strong, it might be better to use the bootstrap values; these three
            # lines would then need to be uncommented:
#            slope=np.median(bs_slopes)
#            intcpt=np.median(bs_intcpts)
#            trend=intcpt+slope*x_var
            #the CI and p-value below also come from the bootstrap; they are used because they
            # account for autocorrelation, which is the point of this function
            #confidence interval on the slope at the requested level
            CI_low,CI_high=np.percentile(bs_slopes, [ci_low, ci_high])
            ecdf=ECDF(bs_slopes)
            pvalue=ecdf(0)
            #make sure the p-value is taken from the correct side of the distribution (one-sided p-value)
            if pvalue>0.5:
                pvalue=1-pvalue
            confint=np.nanpercentile(bs_trends, [ci_low,ci_high], 1)
            print "bs_trends:", bs_trends.shape, confint.shape
            trendCI_low=confint[:,0]
            trendCI_high=confint[:,1]
        else:
            #this is for analytical calculation of trend confidence interval
            #it happens in the same way for each of the trend types, thus it is done here, not under the trendtype subroutines
            #making sure x are floats
            xtemp=np.array(x_var)*1.0
            #squared anomaly
            squanom=(xtemp-np.mean(xtemp))**2
            temp=((1./len(x_var))+(squanom/sum(squanom)))**0.5
            #standard error of estimation
            see=(np.nansum((np.array(y_var)-np.nanmean(y_pred))**2)/len(x_var))**0.5
            #adjusting ci
            ci_adj=1-((1-ci/100.)/2)
            #accounting for uncertainty in mean through student's t
            tcomp=t.ppf(ci_adj, len(x_var)-2)
            #confidence interval
            cint=tcomp*see*temp
            #for trend only
            trendCI_high=y_pred+cint
            trendCI_low=y_pred-cint

        print(trendtype, "slope:", slope, "pvalue (one sided):", pvalue, "conf interval:", CI_low, CI_high, "autocorrelation adjustment:", autocorr, "\n")
    output={"slope":slope, "CI_high":CI_high, "CI_low":CI_high, "pvalue":pvalue, "trend": y_pred, "trendCI_low":trendCI_low, "trendCI_high":trendCI_high}
    return output
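A hedged usage sketch for trend_CI; the synthetic data are made up, and it assumes the helper autocorr_test referenced inside the function is available in the same module:

import numpy as np

# synthetic linear trend with AR(1) noise, purely illustrative
rng = np.random.default_rng(0)
years = np.arange(1980, 2020)
noise = np.zeros(len(years))
for i in range(1, len(years)):
    noise[i] = 0.5 * noise[i - 1] + rng.normal(scale=0.3)
y = 0.02 * (years - years[0]) + noise

res = trend_CI(years, y, n_boot=500, ci=95, trendtype="TheilSen",
               autocorr="bootstrap", CItype="bootstrap")
print(res["slope"], res["CI_low"], res["CI_high"], res["pvalue"])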