# Backward stepwise elimination by adjusted R^2.
# Assumes: import statsmodels.tsa.stattools as ts  (which re-exports OLS and
# add_constant in the statsmodels versions this code targets; statsmodels.api
# provides both officially).
def stepwise(n_total, n_remains, X, Y):
    X_feature = X
    n_iters = len(X_feature.columns)
    Adjust_Rsquare = []

    # Baseline: regression on all variables
    X = ts.add_constant(X)
    model = ts.OLS(Y, X, missing='drop')
    results = model.fit()
    org_adj_R = results.rsquared_adj

    # Repeat the dropping step until only n_remains variables are left
    for j in range(0, n_total - n_remains):
        reg_score = []
        # Drop one variable at a time and refit
        for i in range(0, n_iters):
            X1 = X_feature.drop(X_feature.columns[i], axis=1)
            X1 = ts.add_constant(X1)
            model = ts.OLS(Y, X1, missing='drop')
            results = model.fit()
            reg_score.append(results.rsquared_adj)
        # Drop the variable whose removal yields the highest adjusted R^2
        select = reg_score.index(max(reg_score))
        Adjust_Rsquare.append(max(reg_score))
        X_feature = X_feature.drop(X_feature.columns[select], axis=1)
        n_iters = n_iters - 1

    # Prepend the full-model score and locate the best step
    Adjust_Rsquare[:0] = [org_adj_R]
    n_index = Adjust_Rsquare.index(max(Adjust_Rsquare))
    remain = n_total - n_index  # number of variables kept at the best step
    return Adjust_Rsquare, X_feature, remain, n_index
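# Hedged usage sketch for stepwise() above (toy data; the column names and
# sizes are illustrative, not from the original repo). y depends only on x1
# and x2, so the other columns should be pruned first.
import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as ts  # supplies ts.OLS / ts.add_constant above

rng = np.random.default_rng(0)
X_demo = pd.DataFrame(rng.normal(size=(200, 5)),
                      columns=['x1', 'x2', 'x3', 'x4', 'x5'])
y_demo = 2.0 * X_demo['x1'] - 1.0 * X_demo['x2'] + rng.normal(size=200)

adj_r2_path, kept, n_kept, best_step = stepwise(5, 1, X_demo, y_demo)
print(adj_r2_path, list(kept.columns), n_kept)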
# Assumes: from statsmodels.api import OLS, add_constant
#          from statsmodels.tsa.stattools import adfuller
def get_cointLst(corrList, df_is):  # called in main
    # Test cointegration; the test has to be performed on both sides of the
    # spread (X2 regressed on X1, and X1 regressed on X2).
    cointLst = []
    for pair in corrList:
        X1, X2 = df_is[pair[0]].values, df_is[pair[1]].values
        x1 = add_constant(X1)
        x2 = add_constant(X2)
        r1 = OLS(X2, x1).fit()
        r2 = OLS(X1, x2).fit()
        adf1 = adfuller(r1.resid)[1]
        if adf1 < 0.01:
            adf2 = adfuller(r2.resid)[1]
            if adf2 < 0.01 and adf1 < adf2:
                # Keep only pairs strongly cointegrated on both sides; orient
                # the pair toward the direction with the smaller ADF p-value.
                cointLst.append(["{0}_{1}".format(pair[0], pair[1])]
                                + pair + [adf1] + list(r1.params))
            elif adf2 < 0.01:
                cointLst.append(["{0}_{1}".format(pair[1], pair[0])]
                                + [pair[1], pair[0], pair[2], pair[3], adf2]
                                + list(r2.params))
    # print("There are {0} pairs strongly cointegrated.".format(len(cointLst)))
    return cointLst
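# Hedged usage sketch for get_cointLst(): a synthetic in-sample frame with one
# cointegrated pair. The tail fields of each corrList row (pair[2], pair[3])
# are simply forwarded by the function, so placeholders are used for them here.
import numpy as np
import pandas as pd

rng = np.random.default_rng(1)
n = 500
common = np.cumsum(rng.normal(size=n))  # shared random walk
df_is_demo = pd.DataFrame({
    'AAA': common + rng.normal(scale=0.1, size=n),
    'BBB': 0.5 * common + rng.normal(scale=0.1, size=n),
})
corrList_demo = [['AAA', 'BBB', 0.99, 0.0]]  # placeholder tail fields
print(get_cointLst(corrList_demo, df_is_demo))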
# Assumes: from statsmodels.api import OLS, add_constant
#          from statsmodels.tsa.stattools import adfuller, coint
#          concat_df() is defined elsewhere in this module.
def coint_test():
    df_list = concat_df()
    count = 0
    for df in df_list:
        count += 1
        print("===========", count, "===========")
        f1 = adfuller(df['netflow'])
        f2 = adfuller(df['panic'])
        # f3 = adfuller(df['pnum'])
        print(f1, f2)
        if True:  # if not sum([f1, f2]) == 0 or not sum([f1, f2]) == 2:
            # X = df[['panic', 'pnum']]  # TODO: should lagged terms be included?
            X = df[['panic']]
            X = add_constant(X)
            y = df['netflow']
            coint_res = coint(y, X)  # note: X already includes a constant here
            print(coint_res[1])      # cointegration p-value
            ols = OLS(y, X, missing='drop')
            res = ols.fit()
            print(res.summary())
            print('Panic pvalue:', res.pvalues['panic'])
            if adfuller(res.resid)[1] < 0.05:
                print("Stationary residual")
            else:
                print("Not stationary!")
    return 0
# Assumes: import numpy as np
#          from statsmodels.api import OLS, add_constant
def get_half_life(Z):
    """Half-life of mean reversion from an AR(1) fit: regress z[t] - z[t-1]
    on z[t-1]; the slope lambda gives half-life = -ln(2) / lambda."""
    z_lag = np.roll(Z, 1)
    z_lag[0] = 0  # np.roll wraps the last value to the front; zero it instead
    z_ret = Z - z_lag
    # adds an intercept term to the regressor
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret, z_lag2).fit()
    return int(-np.log(2) / model.params[1])
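# Hedged sanity check for get_half_life(): simulate an AR(1) series with a
# known mean-reversion speed. The regression slope is roughly phi - 1, so the
# estimate should land near -ln(2)/ln(phi) for phi close to 1.
import numpy as np

rng = np.random.default_rng(2)
phi = 0.95  # AR(1) coefficient -> regression slope around -0.05
z = np.zeros(2000)
for t in range(1, len(z)):
    z[t] = phi * z[t - 1] + rng.normal()
print(get_half_life(z))  # expect roughly -log(2)/log(0.95), i.e. about 13-14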
# Assumes (class method): import statsmodels.api as sm
#          from statsmodels.api import add_constant
#          from statsmodels.tsa.arima_model import ARMA  (deprecated; newer
#          statsmodels offers statsmodels.tsa.arima.model.ARIMA instead)
#          self.add_lag is defined elsewhere in the class.
def AR_p(self, x, p=1, method='ols'):
    """Fit an AR(p) model to column x, either by OLS on lagged copies of the
    series or via the statsmodels ARMA estimator."""
    lagged_data = self.add_lag(x, lag=p).copy()
    lagged_data = add_constant(lagged_data)
    # regressor names: lag_1_<x>, ..., lag_p_<x>, const
    ls_x = []
    for i in range(1, p + 1):
        ls_x.append('lag_' + str(i) + '_' + str(x))
    ls_x.append('const')
    self.lagged_data = lagged_data
    if method == 'ols':
        model = sm.OLS(endog=lagged_data.loc[:, x],
                       exog=lagged_data.loc[:, ls_x]).fit()
    elif method == 'stats':
        # TODO check source code and algorithm
        model = ARMA(self.data.loc[:, x], order=(p, 0)).fit()
    # print(model.summary())
    return model
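# Hedged standalone sketch of what AR_p() does for p=1: regress x[t] on a
# constant and x[t-1] by OLS. The add_lag helper and the class plumbing are
# assumed to live elsewhere in the original repo, so they are inlined here.
import numpy as np
import pandas as pd
import statsmodels.api as sm

rng = np.random.default_rng(3)
series = pd.Series(np.zeros(500), name='x')
for t in range(1, len(series)):
    series.iloc[t] = 0.6 * series.iloc[t - 1] + rng.normal()

lagged = pd.concat([series, series.shift(1).rename('lag_1_x')], axis=1).dropna()
lagged = sm.add_constant(lagged)
ar1 = sm.OLS(lagged['x'], lagged[['lag_1_x', 'const']]).fit()
print(ar1.params)  # the lag_1_x coefficient should be close to 0.6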
# Assumes: import numpy as np
#          from statsmodels.api import OLS, add_constant
#          get_z() is defined elsewhere in this module.
def get_half_life_from_scratch(stockX, stockY, beta, df_is):  # called in get_df_coint
    """Same half-life computation as get_half_life(), applied to the spread
    series returned by get_z() for the in-sample frame."""
    z_array = get_z(stockX, stockY, beta, df_is)
    z_lag = np.roll(z_array, 1)
    z_lag[0] = 0
    z_ret = z_array - z_lag
    # adds an intercept term to the regressor
    z_lag2 = add_constant(z_lag)
    model = OLS(z_ret, z_lag2)
    res = model.fit()
    return int(-np.log(2) / res.params[1])
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
#import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df = pd.read_csv('inputData_EWA_EWC.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

x = df['EWA']
y = df['EWC']
# Augment x with ones to accommodate a possible offset in the regression of y vs x.
x = np.array(ts.add_constant(x))[:, [1, 0]]

delta = 0.0001  # delta=1 gives the fastest change in beta; delta -> 0 allows no change (like traditional linear regression).

yhat = np.full(y.shape[0], np.nan)  # measurement prediction
e = yhat.copy()  # measurement prediction error
Q = yhat.copy()  # measurement prediction error variance

# For clarity, we denote R(t|t) by P(t). Initialize R, P and beta.
R = np.zeros((2, 2))
P = R.copy()
beta = np.full((2, x.shape[0]), np.nan)
Vw = delta / (1 - delta) * np.eye(2)
Ve = 0.001

# Initialize beta(:, 1) to zero
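# The snippet above stops right after initialization; a minimal sketch of the
# filtering loop it implies, mirroring the predict() method further down
# (state extrapolation, measurement prediction, Kalman gain, state update):
beta[:, 0] = 0
for t in range(len(y)):
    if t > 0:
        beta[:, t] = beta[:, t - 1]  # state extrapolation: constant dynamics
        R = P + Vw                   # state covariance extrapolation
    yhat[t] = np.dot(x[t, :], beta[:, t])              # measurement prediction
    Q[t] = np.dot(np.dot(x[t, :], R), x[t, :].T) + Ve  # prediction variance
    e[t] = y.iloc[t] - yhat[t]                         # prediction error
    K = np.dot(R, x[t, :].T) / Q[t]                    # Kalman gain
    beta[:, t] = beta[:, t] + K * e[t]                 # state update
    P = R - np.dot(np.outer(K, x[t, :]), R)            # covariance update, P = R(t|t)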
# Assumes (class method): import numpy as np; import pandas as pd;
# from statsmodels.api import OLS; import statsmodels.tsa.stattools as sm
# (for sm.coint; sm.add_constant relies on a re-export there, and
# statsmodels.api provides both officially).
def fit_cointegration_models(self, base_data):
    """
    This function runs cointegration tests between our focus stock and every
    other ticker in the dictionary and returns the weighted residual set of
    the cointegrated series' OLS fits... basically an identifier for
    statistical arbitrage.
    :param base_data: data frame, the base table for the stock
    :return cointegration_residuals: the residual set, displaying arbitrage opportunities
    :return cointegration_set: the tickers indicated as cointegrated
    """
    # set empty containers to hold results of cointegration tests
    r_square_container = []
    cointegration_set = []
    cointegration_fit_container = []
    log_start = pd.to_datetime('today')
    X = base_data[['DATE', 'CLOSE']]
    # Test each remaining ticker against the focus stock; drop the focus stock
    # first so it is not tested against itself and pairs are not duplicated.
    coint_test_keys = list(self.stock_dict.keys())
    coint_test_keys.remove(self.stock)
    for every in coint_test_keys:
        try:
            y = self.stock_dict[str(every)][['DATE', 'CLOSE']]
            # We only use stocks that have at least the length of data of our focus subject
            merge = pd.merge(X, y, how='inner', left_on='DATE', right_on='DATE')
            if len(merge) != len(X):
                print('Lost some data on merge, rejecting cointegration test')
                continue
            one = merge['CLOSE_x']
            two = merge['CLOSE_y']
            ci = sm.coint(one, two, trend='ct', maxlag=0)
            t = ci[1]  # p-value of the Engle-Granger cointegration test
            # if the p-value clears the confidence threshold, regress the two stocks
            if t <= .05:
                # print(str(every) + " is cointegrated")
                two = sm.add_constant(two)
                mod = OLS(one, two).fit()
                se = mod.ssr**(1 / 2)  # square root of the SSR (unused below)
                r_square = mod.rsquared_adj
                if r_square < 0.70:
                    continue
                cointegration_set.append(every)
                r_square_container.append(r_square)
                fitted_values = mod.fittedvalues.to_list()
                cointegration_fit_container.append(fitted_values)
                print('{} and {} are indicated as cointegrated'.format(
                    every, self.stock))
        except Exception:
            print("cointegration test for " + str(every) + " failed")
    if cointegration_fit_container:
        # Average the fitted series, weighting each by its share of total adjusted R^2
        weighted_r_squared = np.divide(r_square_container,
                                       sum(r_square_container))
        cointegration_fit = sum(
            np.multiply(np.array(cointegration_fit_container).T,
                        weighted_r_squared).T)
        cointegration_residuals = (X['CLOSE'] - cointegration_fit)
    else:
        cointegration_residuals = [0] * len(base_data)
    log_end = pd.to_datetime('today')
    print('Cointegration function began running at ' + str(log_start) +
          ' and ended at ' + str(log_end))
    return cointegration_residuals, cointegration_set
# Trading Price Spread
import numpy as np
import pandas as pd
#import matplotlib.pyplot as plt
import statsmodels.formula.api as sm
import statsmodels.tsa.stattools as ts
#import statsmodels.tsa.vector_ar.vecm as vm

df = pd.read_csv('inputData_GLD_USO.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

lookback = 20
hedgeRatio = np.full(df.shape[0], np.nan)
for t in np.arange(lookback, len(hedgeRatio)):
    regress_results = sm.ols(formula="USO ~ GLD",
                             data=df[(t - lookback):t]).fit()  # Note this can deal with NaN in top row
    hedgeRatio[t - 1] = regress_results.params[1]

# yport is the portfolio value using weights [-hedgeRatio, 1] on [GLD, USO]
yport = np.sum(ts.add_constant(-hedgeRatio)[:, [1, 0]] * df, axis=1)
yport.plot()

# Apply a simple linear mean reversion strategy to GLD-USO
numUnits = -(yport - yport.rolling(lookback).mean()) / yport.rolling(lookback).std()
# capital invested in portfolio in dollars. movingAvg and movingStd are functions from epchan.com/book2
positions = pd.DataFrame(np.tile(numUnits.values, [2, 1]).T
                         * ts.add_constant(-hedgeRatio)[:, [1, 0]] * df.values)
# results.evec(:, 1)' can be viewed as the capital allocation, while positions is the dollar capital in each ETF.
pnl = np.sum((positions.shift().values) * (df.pct_change().values), axis=1)  # daily P&L of the strategy
ret = pnl / np.sum(np.abs(positions.shift()), axis=1)
(np.cumprod(1 + ret) - 1).plot()
print('APR=%f Sharpe=%f' % (np.prod(1 + ret)**(252 / len(ret)) - 1,
                            np.sqrt(252) * np.mean(ret) / np.std(ret)))
# Assumes: import statsmodels.tsa.stattools as ts  (re-exports OLS and
# add_constant in the versions this code targets; statsmodels.api provides both).
# Note: this wrapper shadows the statsmodels OLS name at module scope.
def OLS(Y, X):
    """Fit Y on X plus an intercept and print the regression summary."""
    X = ts.add_constant(X)
    model = ts.OLS(Y, X, missing='drop')
    results = model.fit()
    print(results.summary())
def OLSBeta(Y, X):
    """Fit Y on X plus an intercept and return the fitted coefficients."""
    X = ts.add_constant(X)
    model = ts.OLS(Y, X, missing='drop')
    results = model.fit()
    return results.params
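# Hedged usage sketch for the two wrappers above: estimate a hedge ratio
# between two synthetic price series (names and data are illustrative).
import numpy as np
import pandas as pd
import statsmodels.tsa.stattools as ts  # supplies ts.OLS / ts.add_constant above

rng = np.random.default_rng(4)
pA = pd.Series(np.cumsum(rng.normal(size=300)) + 100.0)
pB = 0.8 * pA + rng.normal(scale=0.5, size=300)

OLS(pB, pA)             # prints the full regression summary
print(OLSBeta(pB, pA))  # intercept and slope; the slope should be ~0.8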
# Assumes (class method): import numpy as np
#          import statsmodels.tsa.stattools as ts  (for ts.add_constant;
#          statsmodels.api also provides it officially)
def predict(self, x: np.array, y: np.array):
    """
    Measurement equation: y[t] = b[t]*x[t] + e[t]
    where:
        e ~ N(0, Ve)
        b is a [2xN] array <- slope and intercept
        y is the observable variable/state
        x is the observation model
        b is the hidden variable/state
    ---
    1) State extrapolation:
        yhat[t+1] = x[t+1] * bhat[t+1|t]
        bhat[t+1|t] = bhat[t|t]  <- constant dynamics
    2) State covariance extrapolation:
        Qhat[t+1|t] = x[t+1] * Rhat[t|t] * x[t+1]' + Ve  <- Qhat is the variance of e[t]
        Rhat[t+1|t] = Rhat[t|t] + Vw  <- Rhat is the state uncertainty
        where: Ve and Vw are Gaussian noise variances
    3) Kalman gain:
        K[t] = Rhat[t|t-1]*x[t]' / Qhat[t]
    4) Hidden state update:
        bhat[t|t] = bhat[t|t-1] + K[t]*( y[t] - x[t]*bhat[t|t-1] )
    5) Hidden state covariance update:
        Rhat[t|t] = Rhat[t|t-1] - K[t]*x[t]*Rhat[t|t-1]
    """
    # Augment x with ones to accommodate a possible offset in the regression of y vs x.
    x = np.array(ts.add_constant(x))[:, [1, 0]]
    # Initialize yhat, bhat, Qhat and Rhat.
    yhat = np.empty(y.shape[0])  # measurement predictions
    Qhat = yhat.copy()           # measurement prediction error variance
    bhat = np.empty((x.shape[1], x.shape[0]))
    bhat[:, 0] = 0               # initialize beta(:, 0) to zero
    Rhat = np.zeros((x.shape[1], x.shape[1]))
    Vw = self.delta / (1 - self.delta) * np.eye(x.shape[1])
    Ve = 0.001
    for t in range(len(y)):
        if t > 0:
            # Hidden state extrapolation
            bhat[:, t] = bhat[:, t - 1]
            # Hidden state covariance extrapolation
            Rhat = Rhat + Vw
        # Observable state extrapolation
        yhat[t] = np.dot(x[t, :], bhat[:, t])
        # Observable state variance extrapolation
        Qhat[t] = np.dot(np.dot(x[t, :], Rhat), x[t, :].T) + Ve
        # Kalman gain
        K = np.dot(Rhat, x[t, :].T) / Qhat[t]
        # Hidden state update
        bhat[:, t] = bhat[:, t] + np.dot(K, y[t] - yhat[t])
        # Hidden state covariance update (K x' must be an outer product here;
        # np.dot(K, x[t, :]) would collapse it to a scalar)
        Rhat = Rhat - np.dot(np.outer(K, x[t, :]), Rhat)
    return yhat, bhat, Qhat
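# Hedged usage sketch for predict() above. The enclosing class is not shown in
# this snippet, so a minimal stand-in carrying only the delta attribute is
# assumed; the data are synthetic and purely illustrative.
import numpy as np
import statsmodels.tsa.stattools as ts  # supplies ts.add_constant above

class _KalmanDemo:
    delta = 0.0001

_KalmanDemo.predict = predict  # attach the method defined above

rng = np.random.default_rng(5)
x_demo = np.cumsum(rng.normal(size=400)) + 50.0
y_demo = 1.5 * x_demo + 3.0 + rng.normal(scale=0.5, size=400)

yhat_d, bhat_d, Qhat_d = _KalmanDemo().predict(x_demo, y_demo)
print(bhat_d[:, -1])  # final state estimate: slope (should approach 1.5), then intercept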
# Trading Log Price Spread (same setup as the price-spread script above;
# assumes the same imports: numpy as np, pandas as pd,
# statsmodels.formula.api as sm, statsmodels.tsa.stattools as ts).
df = pd.read_csv('inputData_GLD_USO.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

lookback = 20
hedgeRatio = np.full(df.shape[0], np.nan)
for t in np.arange(lookback, len(hedgeRatio)):
    regress_results = sm.ols(formula="USO ~ GLD",
                             data=np.log(df[(t - lookback):t])).fit()  # Note this can deal with NaN in top row
    hedgeRatio[t - 1] = regress_results.params[1]

yport = np.sum(ts.add_constant(-hedgeRatio)[:, [1, 0]] * np.log(df), axis=1)
yport.plot()

# Apply a simple linear mean reversion strategy to GLD-USO
numUnits = -(yport - yport.rolling(lookback).mean()) / yport.rolling(lookback).std()  # capital invested in portfolio in dollars
# With log prices the spread weights are already dollar allocations, so the
# positions are not multiplied by the price levels here.
positions = pd.DataFrame(np.tile(numUnits.values, [2, 1]).T
                         * ts.add_constant(-hedgeRatio)[:, [1, 0]])  # positions is the dollar capital in each ETF
pnl = np.sum((positions.shift().values) * (df.pct_change().values), axis=1)  # daily P&L of the strategy
ret = pnl / np.sum(np.abs(positions.shift()), axis=1)
(np.cumprod(1 + ret) - 1).plot()
print('APR=%f Sharpe=%f' % (np.prod(1 + ret)**(252 / len(ret)) - 1,
                            np.sqrt(252) * np.mean(ret) / np.std(ret)))
# Bollinger band strategy on the GLD-USO spread (same setup as above; assumes
# the same imports: numpy as np, pandas as pd, statsmodels.formula.api as sm,
# statsmodels.tsa.stattools as ts).
df = pd.read_csv('inputData_GLD_USO.csv')
df['Date'] = pd.to_datetime(df['Date'], format='%Y%m%d').dt.date  # remove HH:MM:SS
df.set_index('Date', inplace=True)

lookback = 20
hedgeRatio = np.full(df.shape[0], np.nan)
for t in np.arange(lookback, len(hedgeRatio)):
    regress_results = sm.ols(formula="USO ~ GLD",
                             data=df[(t - lookback):t]).fit()  # Note this can deal with NaN in top row
    hedgeRatio[t - 1] = regress_results.params[1]

yport = np.sum(ts.add_constant(-hedgeRatio)[:, [1, 0]] * df, axis=1)
yport.plot()

# Bollinger band strategy
entryZscore = 1
exitZscore = 0
MA = yport.rolling(lookback).mean()
MSTD = yport.rolling(lookback).std()
zScore = (yport - MA) / MSTD

longsEntry = zScore < -entryZscore
longsExit = zScore > -exitZscore  # symmetric with shortsExit; the original compared against -entryZscore, which would exit longs immediately after entry
shortsEntry = zScore > entryZscore
shortsExit = zScore < exitZscore
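# The snippet stops at the entry/exit signals; a hedged sketch of one common
# way to carry them into positions and P&L, forward-filling units between
# entry and exit and then mirroring the price-spread script above:
numUnitsLong = pd.Series(np.nan, index=yport.index)
numUnitsShort = pd.Series(np.nan, index=yport.index)
numUnitsLong.iloc[0] = 0
numUnitsShort.iloc[0] = 0
numUnitsLong[longsEntry] = 1
numUnitsLong[longsExit] = 0
numUnitsShort[shortsEntry] = -1
numUnitsShort[shortsExit] = 0
numUnits = numUnitsLong.ffill() + numUnitsShort.ffill()

positions = pd.DataFrame(np.tile(numUnits.values, [2, 1]).T
                         * ts.add_constant(-hedgeRatio)[:, [1, 0]] * df.values)
pnl = np.sum((positions.shift().values) * (df.pct_change().values), axis=1)  # daily P&L
ret = pnl / np.sum(np.abs(positions.shift()), axis=1)
ret = ret.fillna(0)  # rows before the rolling windows fill have no position
print('APR=%f Sharpe=%f' % (np.prod(1 + ret)**(252 / len(ret)) - 1,
                            np.sqrt(252) * np.mean(ret) / np.std(ret)))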