Example #1
def stepwise(n_total, n_remains, X, Y):
    X_feature = X
    n_iters = len(X_feature.columns)
    Adjust_Rsquare = []
    # Run regression of all variables
    X = ts.add_constant(X)
    model = ts.OLS(Y, X, missing='drop')
    results = model.fit()
    org_adj_R = results.rsquared_adj
    # Repeat the variable-dropping step (n_total - n_remains) times
    for j in range(0, n_total - n_remains):
        reg_score = []
        # Drop one variable at a time
        for i in range(0, n_iters):
            X1 = X_feature.drop(X_feature.columns[i], axis=1)
            X1 = ts.add_constant(X1)
            model = ts.OLS(Y, X1, missing='drop')
            results = model.fit()
            reg_score.append(results.rsquared_adj)
        # Select (and drop) the variable whose removal yields the highest adjusted R^2
        selct = reg_score.index(max(reg_score))
        Adjust_Rsquare.append(max(reg_score))
        X_feature = X_feature.drop(X_feature.columns[selct], axis=1)
        n_iters = n_iters - 1
    Adjust_Rsquare[:0] = [org_adj_R]
    n_index = Adjust_Rsquare.index(max(Adjust_Rsquare))
    remain = n_total - n_index
    return Adjust_Rsquare, X_feature, remain, n_index
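A minimal usage sketch (not part of the original example): it assumes `ts` is `statsmodels.api` (which provides both `add_constant` and `OLS`) and that `X` is a pandas DataFrame of candidate regressors with `Y` the response.

import numpy as np
import pandas as pd
import statsmodels.api as ts  # assumed alias so that ts.add_constant and ts.OLS resolve

rng = np.random.default_rng(0)
X = pd.DataFrame(rng.normal(size=(200, 5)), columns=list('abcde'))
Y = 2 * X['a'] - X['b'] + rng.normal(size=200)

# Start from 5 candidate variables and allow dropping down to 2.
adj_r2_path, X_kept, n_remaining, best_step = stepwise(n_total=5, n_remains=2, X=X, Y=Y)
print(n_remaining, list(X_kept.columns))  # size of the best model (by adjusted R^2) and the columns left after dropping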
Example #2
def get_cointLst(corrList, df_is):
    # called in main
    # Cointegration test; the test has to be performed on both sides of the spread
    cointLst = []
    for pair in corrList:
        X1, X2 = df_is[pair[0]].values, df_is[pair[1]].values

        x1 = add_constant(X1)
        x2 = add_constant(X2)
        r1 = OLS(X2, x1).fit()
        r2 = OLS(X1, x2).fit()

        adf1 = adfuller(r1.resid)[1]
        if adf1 < 0.01:
            adf2 = adfuller(r2.resid)[1]
            if adf2 < 0.01 and adf1 < adf2:  # keep only pairs strongly cointegrated on both sides
                cointLst.append(["{0}_{1}".format(pair[0], pair[1])] + pair +
                                [adf1] + list(r1.params))
            elif adf2 < 0.01:
                cointLst.append(["{0}_{1}".format(pair[1], pair[0])] +
                                [pair[1], pair[0], pair[2], pair[3], adf2] +
                                list(r2.params))

    #print "There are {0} pairs strongly cointegrated.".format(len(cointLst))
    return cointLst
Example #3
def coint_test():
    df_list = concat_df()
    count = 0
    for df in df_list:
        count += 1
        print("===========", count, "===========")
        f1 = adfuller(df['netflow'])
        f2 = adfuller(df['panic'])
        # f3 = adfuller(df['pnum'])
        print(f1, f2)
        if True:
            # if not sum([f1, f2]) == 0 or not sum([f1, f2]) == 2:
            # X = df[['panic', 'pnum']]  # Should lagged terms be taken into account?
            X = df[['panic']]
            X = st.add_constant(X)
            y = df['netflow']
            coint = st.coint(y, X)
            print(coint[1])
            ols = st.OLS(y, X, missing='drop')
            res = ols.fit()
            print(res.summary())
            print('Panic pvalue:', res.pvalues['panic'])
            if st.adfuller(res.resid)[1] < 0.05:
                print("Residual is stationary")
            else:
                print("Residual is not stationary!")

    return 0
Example #4
def get_half_life(Z):
    z_lag = np.roll(Z, 1)
    z_lag[0] = 0
    z_ret = Z - z_lag

    # adds intercept terms to X for regression
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret, z_lag2).fit()

    return int(-np.log(2) / model.params[1])
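A short, hedged illustration (not from the original source) of the idea behind get_half_life: the one-period change of the series is regressed on its lagged level, the slope approximates phi - 1 of an AR(1) process, and the half-life of mean reversion is -ln(2) divided by that slope. It assumes `OLS` and `add_constant` come from `statsmodels.api`, matching the names used above.

import numpy as np
from statsmodels.api import OLS, add_constant  # assumed source of the names used above

# Simulate an AR(1) series z_t = 0.9 * z_{t-1} + noise.
# The regression slope of (z_t - z_{t-1}) on z_{t-1} is about 0.9 - 1 = -0.1,
# so the estimated half-life is about -ln(2) / (-0.1), i.e. roughly 7 periods.
rng = np.random.default_rng(1)
z = np.zeros(5000)
for t in range(1, len(z)):
    z[t] = 0.9 * z[t - 1] + rng.normal()

print(get_half_life(z))  # get_half_life defined above; prints roughly 6-7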
Example #5
 def AR_p(self, x, p=1, method='ols'):
     lagged_data = self.add_lag(x, lag=p).copy()
     lagged_data = add_constant(lagged_data)
     ls_x = []
     for i in range(1, p+1):
         ls_x.append('lag_'+str(i)+'_'+str(x))
     ls_x.append('const')
     self.lagged_data = lagged_data
     if method == 'ols':
         model = sm.OLS(endog=lagged_data.loc[:, x], exog=lagged_data.loc[:, ls_x]).fit()
     elif method == 'stats': # TODO check source code and algorithm
         model = ARMA(self.data.loc[:, x], order=(p, 0)).fit()
     # print(model.summary())
     return model
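The method relies on a helper `self.add_lag` that is not shown. Purely as an assumption inferred from the `lag_{i}_{column}` names used above, a minimal version of such a helper might look like this (hypothetical, written as a free function over a DataFrame rather than a method):

import pandas as pd

def add_lag(df: pd.DataFrame, x: str, lag: int = 1) -> pd.DataFrame:
    # Hypothetical helper: append columns lag_1_<x> ... lag_<lag>_<x> containing
    # shifted copies of column <x>, then drop rows without a full lag history.
    out = df.copy()
    for i in range(1, lag + 1):
        out['lag_' + str(i) + '_' + str(x)] = out[x].shift(i)
    return out.dropna()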
Example #6
def get_half_life_from_scratch(stockX, stockY, beta, df_is):
    # called in get_df_coint
    z_array = get_z(stockX, stockY, beta, df_is)

    z_lag = np.roll(z_array, 1)
    z_lag[0] = 0
    z_ret = z_array - z_lag

    # adds intercept terms to X for regression
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret, z_lag2)
    res = model.fit()

    return int(-np.log(2) / res.params[1])
Example #7
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df=pd.read_csv('inputData_EWA_EWC.csv')
df['Date']=pd.to_datetime(df['Date'],  format='%Y%m%d').dt.date # remove HH:MM:SS
df.set_index('Date', inplace=True)

x=df['EWA']
y=df['EWC']

x=np.array(ts.add_constant(x))[:, [1,0]] # Augment x with ones to accommodate a possible offset in the regression of y on x.

delta=0.0001 # delta=1 gives fastest change in beta, delta=0.000....1 allows no change (like traditional linear regression).

yhat=np.full(y.shape[0], np.nan) # measurement prediction
e=yhat.copy()
Q=yhat.copy()

# For clarity, we denote R(t|t) by P(t). Initialize R, P and beta.
R=np.zeros((2,2))
P=R.copy()
beta=np.full((2, x.shape[0]), np.nan)
Vw=delta/(1-delta)*np.eye(2)
Ve=0.001

# Initialize beta(:, 1) to zero
beta[:, 0] = 0
Example #8
    def fit_cointegration_models(self, base_data):
        """
        This function runs cointegration tests on the entire set of tickers in our dictionary and returns the
        residuals of the OLS fits of the cointegrated series, essentially an identifier for statistical arbitrage

        :param base_data: data frame, the base table for the stock

        :return cointegration_residuals: the residual set, displaying arbitrage opportunities
        """
        # set empty containers to hold results of cointegration tests
        r_square_container = []
        cointegration_set = []
        cointegration_fit_container = []

        log_start = pd.to_datetime('today')
        X = base_data[['DATE', 'CLOSE']]

        # Loop through all potential relationships, testing each price series for cointegration against the
        # focus stock; each stock is handled once so the same pairwise relationship is not created several
        # times. Stop when all stocks have been tested.
        coint_test_keys = list(self.stock_dict.keys())
        coint_test_keys.remove(self.stock)
        for every in coint_test_keys:
            try:
                y = self.stock_dict[str(every)][['DATE', 'CLOSE']]
                # We only use stocks that have at least the length of data of our focus subject
                merge = pd.merge(X,
                                 y,
                                 how='inner',
                                 left_on='DATE',
                                 right_on='DATE')
                if len(merge) != len(X):
                    print(
                        'Lost some data on merge, rejecting cointegration test'
                    )
                    continue
                one = merge['CLOSE_x']
                two = merge['CLOSE_y']
                ci = sm.coint(one, two, trend='ct', maxlag=0)
                t = ci[1]
                # if the cointegration p-value (here named t) is below the significance threshold, regress the two stocks
                if t <= .05:
                    # print(str(every)+" is cointegrated")
                    two = sm.add_constant(two)
                    mod = OLS(one, two).fit()
                    se = mod.ssr**(1 / 2)
                    r_square = mod.rsquared_adj
                    if r_square < 0.70:
                        continue
                    cointegration_set.append(every)
                    r_square_container.append(r_square)
                    fitted_values = mod.fittedvalues.to_list()
                    cointegration_fit_container.append(fitted_values)
                    print('{} and {} are indicated as cointegrated'.format(
                        every, self.stock))
            except Exception:
                print("cointegration test for " + str(every) + " failed")

        if cointegration_fit_container:
            weighted_r_squared = np.divide(r_square_container,
                                           sum(r_square_container))
            cointegration_fit = sum(
                np.multiply(
                    np.array(cointegration_fit_container).T,
                    weighted_r_squared).T)
            cointegration_residuals = (X['CLOSE'] - cointegration_fit)

        else:
            cointegration_residuals = [0] * len(base_data)

        log_end = pd.to_datetime('today')
        print('Cointegration function began running at ' + str(log_start) +
              ' and ended at ' + str(log_end))
        return cointegration_residuals, cointegration_set
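A small standalone illustration (my own, not part of the class) of the weighting step near the end: each cointegrated fit is weighted by its share of the total adjusted R^2, the weighted fits are summed, and the result is subtracted from the focus stock's close to form the residual series.

import numpy as np

r_square_container = [0.9, 0.8]            # adjusted R^2 of two cointegration fits
fits = np.array([[10.0, 11.0, 12.0],       # fitted close series from pair 1
                 [9.0, 10.5, 12.5]])       # fitted close series from pair 2

weights = np.divide(r_square_container, sum(r_square_container))  # weights sum to 1
combined_fit = sum(np.multiply(fits.T, weights).T)                # weighted average of the fits
close = np.array([10.2, 10.9, 12.1])
residuals = close - combined_fit
print(weights, combined_fit, residuals)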
Example #9
# Trading Price Spread

import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df=pd.read_csv('inputData_GLD_USO.csv')
df['Date']=pd.to_datetime(df['Date'],  format='%Y%m%d').dt.date # remove HH:MM:SS
df.set_index('Date', inplace=True)

lookback=20
hedgeRatio=np.full(df.shape[0], np.nan)
for t in np.arange(lookback, len(hedgeRatio)):
    regress_results=sm.ols(formula="USO ~ GLD", data=df[(t-lookback):t]).fit() # Note this can deal with NaN in top row
    hedgeRatio[t-1]=regress_results.params[1]

yport=np.sum(ts.add_constant(-hedgeRatio)[:, [1,0]]*df, axis=1)
yport.plot()

# Apply a simple linear mean reversion strategy to GLD-USO
numUnits =-(yport-yport.rolling(lookback).mean())/yport.rolling(lookback).std() # capital invested in portfolio in dollars.  movingAvg and movingStd are functions from epchan.com/book2
positions=pd.DataFrame(np.tile(numUnits.values, [2, 1]).T * ts.add_constant(-hedgeRatio)[:, [1,0]] *df.values) # results.evec(:, 1)' can be viewed as the capital allocation, while positions is the dollar capital in each ETF.
pnl=np.sum((positions.shift().values)*(df.pct_change().values), axis=1) # daily P&L of the strategy
ret=pnl/np.sum(np.abs(positions.shift()), axis=1)
(np.cumprod(1+ret)-1).plot()
print('APR=%f Sharpe=%f' % (np.prod(1+ret)**(252/len(ret))-1, np.sqrt(252)*np.mean(ret)/np.std(ret)))
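A quick illustration (not from the source) of what ts.add_constant(-hedgeRatio)[:, [1, 0]] builds: a two-column weight array [-hedgeRatio, 1], so the row-wise product-and-sum against the [GLD, USO] price columns yields the spread USO - hedgeRatio * GLD.

import numpy as np
import statsmodels.tsa.stattools as ts  # add_constant is importable from here, as used above

hr = np.array([0.5, 0.6, 0.7])
weights = ts.add_constant(-hr)[:, [1, 0]]  # columns reordered to [-hedgeRatio, 1.0]
print(weights)
# [[-0.5  1. ]
#  [-0.6  1. ]
#  [-0.7  1. ]]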
Example #10
def OLS(Y, X):
    X = ts.add_constant(X)
    model = ts.OLS(Y, X, missing='drop')
    results = model.fit()
    print(results.summary())
Example #11
def OLSBeta(Y, X):
    X = ts.add_constant(X)
    model = ts.OLS(Y, X, missing='drop')
    results = model.fit()
    return (results.params)
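A hedged usage sketch (not in the original): OLSBeta returns the fitted parameters with the intercept first, assuming `ts` is `statsmodels.api` as in the other snippets that call `ts.OLS`.

import numpy as np
import statsmodels.api as ts  # assumed alias

x = np.arange(100, dtype=float)
y = 3.0 + 2.0 * x + np.random.default_rng(0).normal(size=100)
params = OLSBeta(y, x)  # OLSBeta defined above
print(params)           # approximately [3.0, 2.0]: [intercept, slope]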
Example #12
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df = pd.read_csv('inputData_EWA_EWC.csv')
df['Date'] = pd.to_datetime(df['Date'],
                            format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

x = df['EWA']
y = df['EWC']

x = np.array(ts.add_constant(x))[:, [1, 0]]  # Augment x with ones to accommodate a possible offset in the regression of y on x.

delta = 0.0001  # delta=1 gives fastest change in beta, delta=0.000....1 allows no change (like traditional linear regression).

yhat = np.full(y.shape[0], np.nan)  # measurement prediction
e = yhat.copy()
Q = yhat.copy()

# For clarity, we denote R(t|t) by P(t). Initialize R, P and beta.
R = np.zeros((2, 2))
P = R.copy()
beta = np.full((2, x.shape[0]), np.nan)
Vw = delta / (1 - delta) * np.eye(2)
Ve = 0.001
Example #13
    def predict(self, x: np.array, y: np.array):
        """ 
        Measurement equation:
            y[t] = b[t]*x[t] + e[t] 
        where:
            e ~ N(0, Ve)
            b is a [2xN] array <- slope and intercept, one column per time step
        
        y is the observable variable/state
        x is the observation model
        b is the hidden variable/state
        ---
        
        1)  State extrapolation: 
            
            yhat[t+1] = x[t+1] * bhat[t+1|t] 
            bhat[t+1|t] = bhat[t|t]  <- constant dynamic
        
        2) State Covariance extrapolation: 
            
            Qhat[t+1|t] = x[t+1] * Rhat[t|t] * x[t+1]' + Ve  <-- Qhat is the variance of the measurement prediction error
            Rhat[t+1|t] = Rhat[t|t] + Vw  <-- Rhat is the hidden-state covariance

            where: Ve and Vw are gaussian noise
         
        3) Kalman gain:
            K[t] = Rhat[t|t-1]*x[t] / Qhat[t]
        
        4) Hidden State update:
            bhat[t|t] = bhat[t|t-1] + K[t]*( y[t] - x[t]*bhat[t|t-1] )
        
        5) Hidden State covariance update:
            Rhat[t|t] = Rhat[t|t-1] - K[t]*x[t]*Rhat[t|t-1]
        
            """
        x = np.array(ts.add_constant(x))[:, [1, 0]]  # Augment x with ones to accommodate a possible offset in the regression of y on x.

        # Initialize yhat, bhat, Qhat and Rhat.
        yhat = np.empty(y.shape[0])  # measurement predictions
        Qhat = yhat.copy()  # measurement predictions error variance

        bhat = np.empty((x.shape[1], x.shape[0]))
        bhat[:, 0] = 0  # Initialize beta(:, 1) to zero
        Rhat = np.zeros((x.shape[1], x.shape[1]))

        Vw = self.delta / (1 - self.delta) * np.eye(x.shape[1])
        Ve = 0.001

        for t in range(len(y)):
            if t > 0:
                # Hidden state extrapolation
                bhat[:, t] = bhat[:, t - 1]
                # Hidden state covariance extrapolation
                Rhat = Rhat + Vw

            # Observable State extrapolation
            yhat[t] = np.dot(x[t, :], bhat[:, t])

            # Observable State variance extrapolation
            Qhat[t] = np.dot(np.dot(x[t, :], Rhat), x[t, :].T) + Ve

            # Kalman gain
            K = np.dot(Rhat, x[t, :].T) / Qhat[t]

            # Hidden state update
            bhat[:, t] = bhat[:, t] + np.dot(K, y[t] - yhat[t])

            # Hidden state covariance update
            Rhat = Rhat - np.dot(np.outer(K, x[t, :]), Rhat)  # outer(K, x) gives the 2x2 update K*x' from step 5

        return yhat, bhat, Qhat
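A minimal driver (an assumption, not part of the original class) showing how predict might be exercised. It supplies a bare object carrying the delta attribute the method needs, assumes the method is reachable as a plain function, and assumes `ts` points at `statsmodels.tsa.stattools` (or `statsmodels.api`), both of which expose `add_constant`.

import numpy as np
import statsmodels.tsa.stattools as ts  # assumed alias for the ts.add_constant call above
from types import SimpleNamespace

rng = np.random.default_rng(0)
x_series = np.cumsum(rng.normal(size=500)) + 50                  # random-walk "price"
y_series = 1.5 * x_series + 3 + rng.normal(scale=0.5, size=500)  # cointegrated partner

# Call the method above unbound, passing a stand-in object that carries delta.
yhat, bhat, Qhat = predict(SimpleNamespace(delta=0.0001), x_series, y_series)
print(bhat[0, -1])  # slope (hedge ratio) estimate, roughly 1.5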
Example #14
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts

df = pd.read_csv('inputData_GLD_USO.csv')
df['Date'] = pd.to_datetime(df['Date'],
                            format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

lookback = 20
hedgeRatio = np.full(df.shape[0], np.nan)
for t in np.arange(lookback, len(hedgeRatio)):
    regress_results = sm.ols(formula="USO ~ GLD",
                             data=np.log(df[(t - lookback):t])).fit()  # Note this can deal with NaN in top row
    hedgeRatio[t - 1] = regress_results.params[1]

yport = np.sum(ts.add_constant(-hedgeRatio)[:, [1, 0]] * np.log(df), axis=1)
yport.plot()

# Apply a simple linear mean reversion strategy to GLD-USO
numUnits = -(yport - yport.rolling(lookback).mean()) / yport.rolling(lookback).std()  # capital invested in portfolio in dollars.  movingAvg and movingStd are functions from epchan.com/book2
positions = pd.DataFrame(np.tile(numUnits.values, [2, 1]).T *
                         ts.add_constant(-hedgeRatio)[:, [1, 0]])  # positions is the dollar capital in each ETF.
pnl = np.sum((positions.shift().values) * (df.pct_change().values), axis=1)  # daily P&L of the strategy
ret = pnl / np.sum(np.abs(positions.shift()), axis=1)
(np.cumprod(1 + ret) - 1).plot()
print('APR=%f Sharpe=%f' % (np.prod(1 + ret)**(252 / len(ret)) - 1,
                            np.sqrt(252) * np.mean(ret) / np.std(ret)))
Example #15
import numpy as np
import pandas as pd
import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts

df = pd.read_csv('inputData_GLD_USO.csv')
df['Date'] = pd.to_datetime(df['Date'],
                            format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

lookback = 20
hedgeRatio = np.full(df.shape[0], np.nan)
for t in np.arange(lookback, len(hedgeRatio)):
    regress_results = sm.ols(formula="USO ~ GLD",
                             data=df[(t - lookback):t]).fit()  # Note this can deal with NaN in top row
    hedgeRatio[t - 1] = regress_results.params[1]

yport = np.sum(ts.add_constant(-hedgeRatio)[:, [1, 0]] * df, axis=1)
yport.plot()

# Bollinger band strategy
entryZscore = 1
exitZscore = 0

MA = yport.rolling(lookback).mean()
MSTD = yport.rolling(lookback).std()
zScore = (yport - MA) / MSTD

longsEntry = zScore < -entryZscore
longsExit = zScore > -exitZscore

shortsEntry = zScore > entryZscore
shortsExit = zScore < exitZscore
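The snippet ends with the entry/exit Booleans. One common way (a sketch, not part of the original example) to turn them into a persistent unit position is to mark entries and exits and forward-fill in between, then combine longs and shorts.

import numpy as np
import pandas as pd

numUnitsLong = pd.Series(np.nan, index=yport.index)
numUnitsShort = pd.Series(np.nan, index=yport.index)

numUnitsLong.iloc[0] = 0
numUnitsLong[longsEntry] = 1      # enter long the spread
numUnitsLong[longsExit] = 0       # flatten the long
numUnitsLong = numUnitsLong.ffill()

numUnitsShort.iloc[0] = 0
numUnitsShort[shortsEntry] = -1   # enter short the spread
numUnitsShort[shortsExit] = 0     # flatten the short
numUnitsShort = numUnitsShort.ffill()

numUnits = numUnitsLong + numUnitsShort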