def gen_randdata(n=2):
    # Generate random data for correlation and cointegration tests.
    # First, cointegrated and correlated
    X_returns = np.random.normal(0, 1, 100)  # Generate the daily returns
    # sum them and shift all the prices up into a reasonable range
    X = pd.Series(np.cumsum(X_returns), name='X') + 50
    some_noise = np.random.normal(0, 1, 100)
    Y = X + 5 + some_noise
    Y.name = 'Y'
    fig = plt.figure()
    ax1 = fig.add_subplot(321)
    ax1.plot(pd.concat([X, Y], axis=1))
    ax2 = fig.add_subplot(322)
    ax2.plot(Y - X)
    ax2.axhline((Y - X).mean(), color='red', linestyle='--')
    score, pvalue, _ = coint(X, Y)
    ax1.set_title('Correlation: %.5f' % X.corr(Y))
    ax2.set_title('Cointegration P: %.5f' % pvalue)
    # Next, correlated but not cointegrated
    X_returns = np.random.normal(1, 1, 100)
    Y_returns = np.random.normal(2, 1, 100)
    X_diverging = pd.Series(np.cumsum(X_returns), name='X_diverging')
    Y_diverging = pd.Series(np.cumsum(Y_returns), name='Y_diverging')
    ax3 = fig.add_subplot(323)
    ax4 = fig.add_subplot(324)
    ax3.plot(pd.concat([X_diverging, Y_diverging], axis=1))
    ax4.plot(Y_diverging - X_diverging)
    ax4.axhline((Y_diverging - X_diverging).mean(), color='red', linestyle='--')
    score, pvalue, _ = coint(X_diverging, Y_diverging)
    ax3.set_title('Correlation: %.5f' % X_diverging.corr(Y_diverging))
    ax4.set_title('Cointegration P: %.5f' % pvalue)
    # Finally, cointegration without correlation ("nominal convergence")
    X_coint = pd.Series(np.random.normal(0, 1, 1000), name='X_coint') + 20
    Y_coint = X_coint.copy()
    for i in range(10):
        if i % 2 == 0:
            Y_coint[i*100:(i+1)*100] = 10
        else:
            Y_coint[i*100:(i+1)*100] = 30
    ax5 = fig.add_subplot(325)
    ax5.plot(X_coint)
    ax5.plot(Y_coint)
    ax5.set_ylim([0, 40])
    ax6 = fig.add_subplot(326)
    ax6.plot(Y_coint - X_coint)
    ax6.axhline((Y_coint - X_coint).mean(), color='red', linestyle='--')
    score, pvalue, _ = coint(X_coint, Y_coint)
    ax5.set_title('Correlation: %.5f' % X_coint.corr(Y_coint))
    ax6.set_title('Cointegration P: %.5f' % pvalue)
    plt.show()
    return
def simple_comparison(X, Y, show=True):
    """
    Compares correlation and cointegration for X and Y.
    X and Y can be Y! (Yahoo! Finance) DataFrames with an 'Adj Close' column, or n x 1 arrays.
    """
    if type(X) is pd.DataFrame:
        x = X['Adj Close']
    else:
        x = X
    if type(Y) is pd.DataFrame:
        y = Y['Adj Close']
    else:
        y = Y
    #x = pd.Series(np.cumsum(x), name='x')
    #y = pd.Series(np.cumsum(y), name='y')
    x = pd.Series(x, name='x')
    y = pd.Series(y, name='y')
    x, y = return_timelocked(x, y)
    fig = plt.figure()
    ax1 = fig.add_subplot(121)
    ax1.plot(pd.concat([x, y], axis=1))
    ax2 = fig.add_subplot(122)
    ax2.plot(y - x)
    ax2.axhline((y - x).mean(), color='red', linestyle='--')
    score, pvalue, _ = coint(x, y)
    ax1.set_title('Correlation: %.5f' % x.corr(y))
    ax2.set_title('Cointegration P: %.5f' % pvalue)
    if show:
        plt.show()
        return
    return ax1, ax2
def test_coint(pair):
    # Using Augmented Dickey-Fuller unit root test (from Brett's code);
    # see the Engle-Granger sketch after this function.
    #result = sm.OLS(pair[1], pair[0]).fit()
    #dfResult = ts.adfuller(result.resid)
    #return dfResult[0] >= dfResult[4]['10%']
    # Using the cointegration test built into statsmodels
    result = ts.coint(pair[1], pair[0])
    # Note: this returns True when the test statistic is above the 10% critical
    # value, i.e. when the null of no cointegration is NOT rejected.
    return result[0] >= result[2][2]
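# For reference, a minimal sketch of the two-step Engle-Granger procedure the
# commented-out lines above allude to: regress one series on the other with OLS,
# then run an ADF unit-root test on the residuals. The helper name and argument
# names here are illustrative, not from the original code, and note that a proper
# Engle-Granger test uses different critical values than a plain ADF test
# (statsmodels' coint() handles that for you).
import numpy as np
import statsmodels.api as sm
import statsmodels.tsa.stattools as ts

def run_engle_granger(a, b, alpha=0.05):
    # Step 1: OLS regression b ~ const + a; the slope is the hedge ratio.
    model = sm.OLS(b, sm.add_constant(a)).fit()
    # Step 2: ADF unit-root test on the regression residuals.
    adf_stat, adf_pvalue = ts.adfuller(model.resid)[:2]
    # A small p-value suggests stationary residuals, i.e. a and b cointegrate.
    return adf_pvalue < alpha, adf_stat, adf_pvalue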
def test_coint_identical_series():
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    y = scale_e * np.random.randn(nobs)
    warnings.simplefilter('always', CollinearityWarning)
    with pytest.warns(CollinearityWarning):
        c = coint(y, y, trend="c", maxlag=0, autolag=None)
    assert_equal(c[1], 0.0)
    assert_(np.isneginf(c[0]))
def test_coint_identical_series():
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    y = scale_e * np.random.randn(nobs)
    warnings.simplefilter('always', ColinearityWarning)
    with warnings.catch_warnings(record=True) as w:
        c = coint(y, y, trend="c", maxlag=0, autolag=None)
    assert_equal(len(w), 1)
    assert_equal(c[1], 0.0)
    assert_(np.isneginf(c[0]))
def test_coint_perfect_collinearity():
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    x = scale_e * np.random.randn(nobs, 2)
    y = 1 + x.sum(axis=1)
    warnings.simplefilter('always', ColinearityWarning)
    with warnings.catch_warnings(record=True) as w:
        c = coint(y, x, trend="c", maxlag=0, autolag=None)
    assert_equal(c[0], 0.0)
    # Limit of table
    assert_(c[1] > .98)
def test_coint_perfect_collinearity():
    # test uses nearly perfect collinearity
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    x = scale_e * np.random.randn(nobs, 2)
    y = 1 + x.sum(axis=1) + 1e-7 * np.random.randn(nobs)
    warnings.simplefilter('always', CollinearityWarning)
    with warnings.catch_warnings(record=True) as w:
        c = coint(y, x, trend="c", maxlag=0, autolag=None)
    assert_equal(c[1], 0.0)
    assert_(np.isneginf(c[0]))
def get_pairs(train):
    s = train.corr().abs().unstack().drop_duplicates()
    corr = s.sort_values()
    corr.to_csv('~/Desktop/MAFN/Hedge Fund/presentation2/correlation.csv')
    pairs = list(corr.index)[-200:-1]
    ans = []
    for pair in pairs:
        # test on the training data passed in (the original referenced a global `df`)
        _, p_value, _ = coint(train[[pair[0]]], train[[pair[1]]])
        if p_value < 0.05:
            ans.append(pair)
    selected = corr[ans]
    selected.to_csv('highcorrelationandcointegrated.csv', header=None)
    return list(selected.index)
def find_cointegrated_pairs(securities_panel): n = len(securities_panel.minor_axis) score_matrix = np.zeros((n, n)) pvalue_matrix = np.ones((n, n)) keys = securities_panel.keys pairs = [] for i in range(n): for j in range(i+1, n): S1 = securities_panel.minor_xs(securities_panel.minor_axis[i]) S2 = securities_panel.minor_xs(securities_panel.minor_axis[j]) result = coint(S1, S2) score = result[0] pvalue = result[1] score_matrix[i, j] = score pvalue_matrix[i, j] = pvalue if pvalue < 0.05: pairs.append((securities_panel.minor_axis[i], securities_panel.minor_axis[j])) return score_matrix, pvalue_matrix, pairs
def find_cointegrated_pairs(stock_data, significance=0.05):
    # ------------------
    # This function is from,
    # https://www.quantopian.com/lectures/introduction-to-pairs-trading.
    # We also sort the pairs in ascending order by p-value
    n = stock_data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = stock_data.keys()
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            S1 = stock_data[keys[i]]
            S2 = stock_data[keys[j]]
            result = stattools.coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < significance:
                pairs.append({(keys[i], keys[j]): pvalue})
    # sort the significant pairs in ascending order by p-value, as promised above
    pairs.sort(key=lambda d: list(d.values())[0])
    return score_matrix, pvalue_matrix, pairs
def calibrate(self, prices): # identify valid pairs possible_pairs = self.get_possible_pairs(prices) valid, betas = [], [] for pair in possible_pairs: # Engle-Granger Test log_1, log_2 = np.log(prices[list(pair)])._series.values() if ts.coint(log_1, log_2)[1] > 0.05: continue # Augmented Dickey-Fuller Test alpha, beta = cointegrate(log_1, log_2) spread = log_1 - beta * log_2 - alpha if ts.adfuller(spread)[1] > 0.05: continue # Mean Reversion Test k = get_reversion_speed(spread) if k < 0.2: continue valid.append(pair) # remove invalid live pairs invalid = set(self.pairs.keys()) - set(valid) orders = [] for pair in invalid: positions = np.array(self.pairs[pair]['positions']) if np.absolute(positions).sum() > 0: orders += list(zip(pair, positions * -1)) del self.pairs[pair] # add new valid pairs for pair in valid: if pair not in self.pairs: training_data = np.log(prices[list(pair)].values[:-1]) self.pairs[pair] = { 'positions': [0, 0], 'state': 'neutral', 'filter': Kalman_Filter(training_data) } return orders
def find_all_pairs(days: int, data_dir: str='StockData', corr_value=.9, p_value=.05, candle_type: str='close'): data_dir = Path.joinpath(Path.cwd(), data_dir) data_file_names = ticker_list(data_dir) data_file_names = data_file_names[:45] # to not test 124000 possibilities in ~500 stock list cointegrated = [] for i in range(len(data_file_names)): for j in range(i + 1, len(data_file_names)): data_list_1 = ranged_price_list(data_file_names[i], days, candle_type) data_list_2 = ranged_price_list(data_file_names[j], days, candle_type) # correlation test if corr_value < pearson_coor(data_list_1, data_list_2): coint_value = coint(data_list_1, data_list_2)[1] # cointegrated test if coint_value < p_value: cointegrated.append( (coint_value, data_file_names[i].name, data_file_names[j].name)) return cointegrated
def EG_coin_result(a, b): X = table1.loc[a][:2016] Y = table1.loc[b][:2016] #the data is from 2005 to 2015, 2517 days, there are 252 trading days in a year, #define the first 8 years as training period, 2016 days #start = datetime.datetime(2005, 1, 1) #end = datetime.datetime(2009, 12, 31) #mask = (data['date2'] > start) & (data['date2'] <= end) #X=X.loc[mask] #Y=Y.loc[mask] #result = pd.merge(X, Y, on='date2') #x1=result['prc_x'] #closing price #y1=result['prc_y'] #closing price ''' #calculate price with 100 newprc_x=pd.Series() newprc_x.set_value(result['date2'][0],100) for i in range(1,x1.size): newprc_x.set_value(result['date2'][i],newprc_x[i-1]*(result['ret_x'][i-1]+1)) newprc_y=pd.Series(index=result['date2']) newprc_y.set_value(result['date2'][0],100) for i in range(1,y1.size): newprc_y.set_value(result['date2'][i],newprc_y[i-1]*(result['ret_y'][i-1]+1)) ''' #print ("Test 1: Engle Granger") try: coin_result = ts.coint(X, Y) #print("p-value: ",coin_result[1]) #print('pair: ',a,'&',b) if (coin_result[1] < 0.05 and not coin_result[1] == 0): #print("passed") passed = True #print("p value:",coin_result[1]) else: #print("not passed") passed = False except: passed = False return passed
def perform_coint(start_pointer, end_pointer): stock_series = get_data(start_pointer, end_pointer) p_values = [] coint_sec = [] tickers = [ 'XOM', 'RDS.A', 'CVX', 'TOT', 'BP', 'PTR', 'SNP', 'SLB', 'EPD', 'E', 'COP', 'EQNR', 'EOG', 'PBR', 'CEO', 'SU', 'OXY', 'HAL' ] for i in range(len(tickers)): p_values.append([0] * len(tickers)) coint_sec.append([0] * len(tickers)) for i in range(0, len(tickers)): for j in range(0, len(tickers)): if (i < j): p_values[i][j] = (ts.coint(stock_series[tickers[i]], stock_series[tickers[j]]))[1] else: p_values[i][j] = 0.5 threshold = 0.005 pair_header = ['Stock 1', 'Stock 2'] pairs = [] pairs.append(pair_header) for i in range(0, len(tickers)): for j in range(0, len(tickers)): if (i < j): if (p_values[i][j] < threshold): coint_sec[i][j] = 1 pairs.append([tickers[i], tickers[j]]) else: coint_sec[i][j] = 0 else: p_values[i][j] = 0 return tickers, pairs, p_values, coint_sec
def Analyze_Data(data):
    # check for cointegration
    score, pvalue, _ = coint(data.x1, data.x2)
    # test for stationarity (zscore helper assumed; see the sketch after this function)
    stationary, station_pvalue = check_for_stationarity(data.Z)
    data.stationary = stationary
    data.coi_pvalue = pvalue
    data.stn_pvalue = station_pvalue
    num = '{:2.3f}'
    if pvalue < 0.05:
        # if pvalue < 0.05 and stationary and data.b > 0 and data.b < 3:
        # if pvalue < 0.05 and data.b > 0 and stationary:
        data.trade_signal = True
        data.x1_symbol = symbols[1][data.i1]
        data.x2_symbol = symbols[1][data.i2]
        if zscore(data.Z).iloc[-1] > 0:
            data.x1_signal = trade_type.BUY
            data.x2_signal = trade_type.SELL
        elif zscore(data.Z).iloc[-1] < 0:
            data.x1_signal = trade_type.SELL
            data.x2_signal = trade_type.BUY
        text1 = ('Cointegration between ' + data.x1_symbol + ' and ' + data.x2_symbol
                 + ' with p-value = ' + num.format(data.coi_pvalue))
        text2 = 'Beta (b) is ' + num.format(data.b)
        text3 = 'Spread is stationary with pvalue ' + num.format(data.stn_pvalue)
        text4 = 'spread max = ' + num.format(zscore(data.Z).max())
        text5 = 'spread min = ' + num.format(zscore(data.Z).min())
        text6 = 'current spread value = ' + num.format(zscore(data.Z).iloc[-1])
        print(text1, '\n', text2, '\n', text3, '\n', text4, '\n', text5, '\n', text6)
    return data
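# The zscore() helper used above is not defined in this snippet. A minimal sketch,
# assuming the same convention as the helper defined elsewhere in this collection
# (standardize a series by its own mean and standard deviation):
def zscore(series):
    return (series - series.mean()) / np.std(series)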
def get_conint_params_2(ts1, ts2, window=30):
    if len(ts1) != len(ts2):
        return False
    else:
        df = pd.DataFrame([])
        df['ts_1'] = ts1
        df['ts_2'] = ts2
        # Calculate optimal hedge ratio "beta"
        df = df.fillna(method='ffill').dropna()
        df['ratio'] = [y / x for x, y in zip(ts1, ts2)]
        score, pvalue, _ = coint(ts1, ts2)
        # rolling median (5 bars) of the ratio z-scored over `window`
        res_norm = zscore(df['ratio'], window).rolling(5).median()
        print("P: " + str(pvalue))
        #if pvalue < 0.3:  # cadf[1] <= 0.1 or
        plt.plot(res_norm)
        plt.show()
        plt.plot(df['ratio'])
        plt.show()
        return pvalue
def perform_cointegration(self, company1: str, company2: str) -> CointegrationResult:
    """
    Perform a cointegration test between two companies' closing-price series.

    :param company1: company 1 in the list of companies
    :type company1: str
    :param company2: company 2 in the list of companies
    :type company2: str
    :return: CointegrationResult holding the t-statistic of the unit-root test on the
        residuals, the p-value, and the critical values of the test statistic at the
        1%, 5%, and 10% levels
    :rtype: CointegrationResult
    """
    if company1 not in self.companies or company2 not in self.companies:
        raise ValueError(
            'Input companies: {}, {} must be in the list of companies used to construct '
            'with: {}'.format(company1, company2, self.companies))
    coint_t, pvalue, crit = coint(self.closing_prices[company1],
                                  self.closing_prices[company2])
    return CointegrationResult(coint_t=coint_t, pvalue=pvalue, crit=crit)
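# CointegrationResult is referenced above but not defined in this snippet. A minimal
# sketch of a compatible container, assuming a NamedTuple with the three fields the
# constructor call uses (an illustration, not the original class):
from typing import NamedTuple
import numpy as np

class CointegrationResult(NamedTuple):
    coint_t: float      # t-statistic of the unit-root test on the residuals
    pvalue: float       # MacKinnon approximate p-value
    crit: np.ndarray    # critical values at the 1%, 5% and 10% levels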
def plotpairs(clA, clB):
    clA, clB = loaddata()
    # Plot the two price series
    plt.figure(1)
    plt.plot(clA, label='EWA', color='b')
    plt.plot(clB, label='EWC', color='g')
    plt.legend()
    plt.figure(2)
    plt.scatter(clA, clB)
    ## Linear regression
    clA = sm.add_constant(clA)
    regression_result = sm.OLS(clB, clA).fit()
    hedgeRatio = regression_result.params[1]  ## determine the hedge ratio
    # first restore clA (drop the constant column added above)
    clA = clA[:, 1]
    clC = clB - hedgeRatio * clA
    plt.figure(3)
    plt.plot(clC)
    ## Test for cointegration
    results = coint(clA, clB)
def find_cointegrated_pairs(df_array):
    # Takes in an array of dataframes, outputs cointegrated pairs
    for df in df_array:
        n = df.shape[1]
        score_matrix = np.zeros((n, n))
        pvalue_matrix = np.ones((n, n))
        keys = df.keys()
        pairs = []
        total_stocks = n
        total_tests = (total_stocks * (total_stocks - 1)) / 2  # n choose 2 total combinations
        for i in range(n):
            for j in range(i+1, n):
                S1 = df[keys[i]]
                S2 = df[keys[j]]
                result = ts.coint(S1, S2)
                score = result[0]
                pvalue = result[1]
                score_matrix[i, j] = score
                pvalue_matrix[i, j] = pvalue
                # Note: no Bonferroni correction is actually applied here; a corrected
                # threshold would be 0.05 / total_tests (see the sketch after this function).
                if pvalue < 0.05:
                    pairs.append((keys[i], keys[j]))
        total_stocks = 0
    # print pairs
    return pairs
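# When many pairs are tested at once, the chance of false positives grows with the
# number of tests. A minimal sketch of the Bonferroni correction mentioned above
# (illustrative helper, not part of the original code):
def bonferroni_threshold(n_series, alpha=0.05):
    # number of pairwise tests is n choose 2
    n_tests = n_series * (n_series - 1) / 2
    return alpha / n_tests

# e.g. with 50 series there are 1225 tests, so the per-test cutoff drops to about 4.1e-05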
def find_cointegrated_pairs(securities_panel):
    '''
    Function to find the best pairs w.r.t. co-integration
    :param securities_panel: panel with closing prices per security
    :return: matrix with p-value of co-integration
    '''
    n = len(securities_panel.columns)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = securities_panel.keys
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            S1 = securities_panel.iloc[:, i]
            S2 = securities_panel.iloc[:, j]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.01:
                pairs.append((securities_panel.columns[i], securities_panel.columns[j], pvalue))
    return score_matrix, pvalue_matrix, pairs
def find_cointegrated_pairs(data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[Tuple]]: """ This function will look for pairs of cointegrated stocks. :param data: pd.DataFrame :return: """ n = data.shape[1] score_matrix = np.zeros((n, n)) pvalue_matrix = np.ones((n, n)) keys = data.keys() # We store the stock pairs that are likely to be cointegrated pairs = [] progress_bar = tqdm(total=n**2/2, desc='Running cointegration tests...') for i in range(n): for j in range(i+1, n): S1: pd.Series = data[keys[i]] # values from first column S2: pd.Series = data[keys[j]] # values from second column result: Tuple = coint(S1, S2) # level of cointegration score: float = result[0] # t-score pvalue: float = result[1] # p-value score_matrix[i, j] = score # add coint scores to score_matrix pvalue_matrix[i, j] = pvalue # add pvalues to pvalue_matrix # if the p-value is less than the tresh, append to list if pvalue < 0.02: pairs.append((keys[i], keys[j])) progress_bar.update(1) progress_bar.close() return score_matrix, pvalue_matrix, pairs
def main():
    # set starting variables (only need to manually set tickers)
    endDate = date.today()
    startDate = endDate - datetime.timedelta(days=5 * 365)
    tickers = ['SMH', 'ARKK', 'XLK', 'QQQ', 'AAPL', 'MSFT',
               'TSLA', 'ORCL', 'QCOM', 'AMD', 'UBER', 'SQ']
    # get data for each ticker
    data = pdr.get_data_yahoo(tickers, start=startDate, end=endDate)
    prices = data["Adj Close"].dropna(axis='columns')
    # set up data for test
    keysList = prices.keys()
    pValMax = 0.2
    pairsList = []
    print(f"\n{str(len(keysList))} tickers span a valid backtest with {int((len(keysList) * (len(keysList) - 1)) / 2)} possible pair(s).")
    # run cointegration test on all possible pairs
    for i in range(len(keysList)):
        for j in range(i + 1, len(keysList)):
            # test the price series, not the ticker strings themselves
            result = coint(prices[keysList[i]], prices[keysList[j]])
            pvalue = result[1]
            if pvalue < pValMax:
                corr = np.corrcoef(prices[keysList[i]], prices[keysList[j]])
                pairsList.append((keysList[i], keysList[j], pvalue, corr[0][1]))
    pairsList = sorted(pairsList, key=itemgetter(3), reverse=True)
    print(f"{len(pairsList)} possible cointegrated pairs with p-values less than {str(pValMax)}:")
    # print out valid pairs with sufficient p-value
    for pair in pairsList:
        print(f"\n {pair[0]} and {pair[1]}:")
        print(f"p-value = {round(pair[2], 4)}")
        print(f"correlation coefficient = {round(pair[3], 4)}")
def coin_test():
    start_date = 1580515200000  # 1 February 2020, 00:00:00 UTC
    end_date = 1585612800000    # 31 March 2020, 00:00:00 UTC
    assets = [
        'EOSUSD', 'BTCUSD', 'ETHUSD', 'LTCUSD', 'TRXUSD', 'NEOUSD', 'ETCUSD', 'XLMUSD'
    ]
    crypto_prices = pd.DataFrame()
    print("\n\nStarting coin_test function............\n")
    for a in assets:
        print('Downloading ' + a)
        crypto_prices[a] = get_bitfinex_asset(asset=a, ts_ms_start=start_date, ts_ms_end=end_date)
    crypto_prices.head()
    # Normalize prices by first value
    norm_prices = crypto_prices.divide(crypto_prices.iloc[0])
    print(f'\n\n\nPrinting norm prices....\n\n{norm_prices}\n\n')
    plt.figure(figsize=(15, 10))
    plt.plot(norm_prices)
    plt.xlabel('days')
    plt.title('Performance of cryptocurrencies')
    plt.legend(assets)
    plt.show()
    df_dic = {'asset pairs': [], 'test result': []}
    for a1 in crypto_prices.columns:
        for a2 in crypto_prices.columns:
            if a1 != a2:
                test_result = ts.coint(crypto_prices[a1], crypto_prices[a2])
                # print(a1 + ' and ' + a2 + ': p-value = ' + str(test_result[1]))
                df_dic['asset pairs'].append(a1 + ' and ' + a2)
                df_dic['test result'].append(test_result[1])
def find_cointegrated_pairs(securities):
    # Quantopian function to compare pairs of securities; revised to use
    # pd.Series instead of pd.Panel
    if type(securities) is not pd.Series:
        print('type is %s but should be pd.Series' % type(securities))
        return
    n = len(securities.index)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = securities.index
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            S1 = securities[i]
            S2 = securities[j]
            S1, S2 = return_timelocked(S1, S2)
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.05:
                pairs.append((keys[i], keys[j]))
    return score_matrix, pvalue_matrix, pairs
def find_cointegrated_pairs(data): import numpy as np import pandas as pd import statsmodels import statsmodels.api as sm from statsmodels.tsa.stattools import coint n = data.shape[1] score_matrix = np.zeros((n, n)) pvalue_matrix = np.ones((n, n)) keys = data.keys() pairs = [] for i in range(n): for j in range(i + 1, n): S1 = data[keys[i]] S2 = data[keys[j]] result = coint(S1, S2) score = result[0] pvalue = result[1] score_matrix[i, j] = score pvalue_matrix[i, j] = pvalue if pvalue < 0.05: pairs.append((keys[i], keys[j])) return score_matrix, pvalue_matrix, pairs
def find_cointegrated_pairs(securities_panel): n = len(securities_panel.minor_axis) score_matrix = np.zeros((n, n)) pvalue_matrix = np.ones((n, n)) keys = securities_panel.keys pairs = [] # Cycles through all combinations of two securities and checks cointegration on each for i in range(n): for j in range(i + 1, n): S1 = securities_panel.minor_xs(securities_panel.minor_axis[i]) S2 = securities_panel.minor_xs(securities_panel.minor_axis[j]) result = coint(S1, S2) score = result[0] pvalue = result[1] score_matrix[i, j] = score pvalue_matrix[i, j] = pvalue # Returns statistically significant pairs if pvalue < 0.05: pairs.append((securities_panel.minor_axis[i], securities_panel.minor_axis[j])) return score_matrix, pvalue_matrix, pairs
stock_prices.drop('date', axis = 1, inplace = True) stock_prices = stock_prices.iloc[::-1] stock_returns = stock_prices.pct_change() stock_returns.drop('2013-07-31',axis=0,inplace = True) #calculate stocks' daily returns #stock_returns = stock_returns.interpolate().dropna(axis=1) stock_prices = stock_prices.interpolate().dropna(axis=1) stock_list = list(stock_prices.columns.values) pairs = [] #define pairs as list #test cointegration of the stocks for i in range(0,len(stock_list),1): for j in range(i+1, len(stock_list),1): results = coint(stock_prices.iloc[:,i],stock_prices.iloc[:,j]) if results[1]< 0.01: #get the pvalues pairs.append([stock_list[i],stock_list[j],results[1]]) pairs_selected = [] #test difference of stock prices using ADF test for i in range(0,len(pairs),1): results = sts.adfuller(stock_prices[pairs[i][0]]-stock_prices[pairs[i][1]],1) if results[1] < 0.05: pairs_selected.append(pairs[i]) #get the smallest p-value pair in 'pairs_selected' pairs_selected.sort(key=itemgetter(2)) #ascending order thus the first two stocks are chosen stock_1 = pairs_selected[0][0] #China State Ship Building
def do_coint_tests(
        dataframe: pd.DataFrame,
        pvalue_threshold: float = 0.04,
        coint_dir: Path = Path('tmp')) -> Tuple[pd.DataFrame, pd.DataFrame, List]:
    """
    Perform coint test between all series. Also writes the nested dictionaries to file.

    By default, the function will write the nested dictionaries to:
    - scores.json
    - pvalue.json

    :param dataframe: pandas DataFrame, where each series is a stock
    :param pvalue_threshold: float value representing threshold for pvalue to be considered cointegrated
    :return: Tuple of (df_scores, df_pvalue, pairs)
    """
    # big dictionaries to keep track of values
    scores_dict: Dict[str, Dict[str, float]] = {}
    pvalue_dict: Dict[str, Dict[str, float]] = {}
    pairs: List[Tuple[str, str]] = []
    # iter through columns
    num_columns = len(dataframe.columns)
    pbar = tqdm(total=int(num_columns**2/2) - int(num_columns/2),
                desc='Performing Cointegration tests between columns')
    for i, (name1, s1) in enumerate(dataframe.iteritems()):
        for j, (name2, s2) in enumerate(dataframe.iloc[:, i+1:].iteritems()):
            # get coint results between two columns
            score, pvalue, _ = coint(s1, s2)
            # add scores and pvalues to dictionaries
            update_coint_dict(scores_dict, score, name1, name2)
            update_coint_dict(pvalue_dict, pvalue, name1, name2)
            # if the p-value is less than the thresh, append to list
            if pvalue < pvalue_threshold:
                pairs.append((name1, name2))
            pbar.update(1)
    coint_dir.resolve()
    coint_dir.mkdir(parents=True, exist_ok=True)
    with (coint_dir / 'scores.json').open('w') as f:
        json.dump(scores_dict, f, indent=4)
    # with open('scores.json', 'w') as f:
    #     json.dump(scores_dict, f, indent=4)
    with (coint_dir / 'pvalue.json').open('w') as f:
        # write the p-value dictionary (the original mistakenly dumped scores_dict here)
        json.dump(pvalue_dict, f, indent=4)
    # with open('pvalue.json', 'w') as f:
    #     json.dump(pvalue_dict, f, indent=4)
    with (coint_dir / 'pairs.txt').open('w') as f:
        for name1, name2 in pairs:
            f.write('{},{}\n'.format(name1, name2))
    # with open('pairs.txt', 'w') as f:
    #     for name1, name2 in pairs:
    #         f.write('{},{}\n'.format(name1, name2))
    df_scores = dataframe_from_coint_dict(scores_dict)
    df_pvalue = dataframe_from_coint_dict(pvalue_dict)
    return df_scores, df_pvalue, pairs
if analysis_type < 2: # Calculate and plot price correlations. pearson_corr = df[tickers].corr() sns.clustermap(pearson_corr).fig.suptitle('Pearson Correlations') if analysis_type == 1: plt.show() if analysis_type != 1: # Plot the marginal distributions. sns.set(style='darkgrid') sns.jointplot(df[pair_1], df[pair_2], kind='hex', color='#2874A6') # Calculate the p-value of cointegration test. x = df[pair_1] y = df[pair_2] _, p_value, _ = coint(x, y) print('The p_value of pair cointegration is: {}'.format(p_value)) # Plot the linear relationship of the EURJPY-GBPJPY pair. df2 = df[[pair_1, pair_2]].copy() spread = df2[pair_1] - df2[pair_2] mean_spread = spread.mean() df2['Dev'] = spread - mean_spread rnd = np.random.choice(len(df), size=500) sns.scatterplot(x=pair_1, y=pair_2, hue='Dev', linewidth=0.3, alpha=0.8, data=df2.iloc[rnd, :]).set_title( '%s-%s Price Relationship' % (pair_1, pair_2))
# Plot the BIC as a function of p
plt.plot(range(1, 7), BIC[1:7], marker='o')
plt.xlabel('Order of AR Model')
plt.ylabel('Bayesian Information Criterion')
plt.show()

###### COINTEGRATION
from statsmodels.tsa.stattools import coint, adfuller
from statsmodels.api import OLS, add_constant

P = df5['Adj Close_AMZN_Actual']
Q = df5['Adj Close_MSFT_Actual']
coint(P, Q)

ols = OLS(Q, add_constant(P)).fit()
# note: despite the name, this is the regression slope (hedge ratio), not a p-value
p_val = ols.params[1]
ad_test = adfuller(Q - p_val*P)
def get_p_value(x, y):
    _, p_val, _ = coint(x, y)
    return p_val
plt.axhline((Y/X).mean(), color='red', linestyle='--')
plt.xlabel('Time')
plt.legend(['Price Ratio', 'Mean'])
plt.show()

"""Here is a plot of the ratio between the two series. Notice how it tends to revert
back to the mean? This is a clear sign of cointegration.

## Cointegration Test

You now know what it means for two stocks to be cointegrated, but how do we actually
quantify and test for cointegration?

The module statsmodels has a good cointegration test that outputs a t-score and a
p-value. It's a lot of statistical mumbo-jumbo that shows us the probability that we
get a certain value given the distribution. In the end, we want to see a low p-value,
ideally less than 5%, to give us a clear indicator that the pair of stocks are very
likely to be cointegrated.
"""

score, pvalue, _ = coint(X, Y)
print(pvalue)  # Low pvalue means high cointegration!

"""### Clarification

In case you are a bit on the ropes regarding the difference between correlation and
cointegration, let me show you some pictures that will make the distinction between
correlation and cointegration clear.
"""

ret1 = np.random.normal(1, 1, 100)
ret2 = np.random.normal(2, 1, 100)
s1 = pd.Series(np.cumsum(ret1), name='X_diverging')
s2 = pd.Series(np.cumsum(ret2), name='Y_diverging')
pd.concat([s1, s2], axis=1).plot(figsize=(15, 7))
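# To make the distinction concrete, a quick check in the same spirit as the generated
# series above (illustrative names, not from the original notebook): s1 and s2 drift
# apart, so they can be highly correlated while the cointegration p-value stays large.
score_div, pvalue_div, _ = coint(s1, s2)
print('Correlation: %.5f' % s1.corr(s2))
print('Cointegration p-value: %.5f' % pvalue_div)  # typically far above 0.05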
def testForCointegrationJohansen(self, series1, series2):
    # Note: statsmodels' coint() runs the Engle-Granger two-step test (here with a
    # constant-plus-trend term, "ct"), not the Johansen test the method name suggests;
    # see the coint_johansen sketch after this function.
    a = coint(series1, series2, "ct")
    return a
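# If a Johansen test is actually wanted, statsmodels provides coint_johansen (it is
# also used elsewhere in this collection). A minimal sketch, assuming two aligned
# price series; the helper name and defaults are illustrative:
import pandas as pd
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def johansen_test(series1, series2, det_order=0, k_ar_diff=1):
    data = pd.concat([series1, series2], axis=1).dropna()
    result = coint_johansen(data, det_order, k_ar_diff)
    # lr1 holds the trace statistics, cvt the corresponding 90%/95%/99% critical values
    return result.lr1, result.cvt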
MR_one = mean_revert_FX_one.mean_revert_algo(eurusd,gbpusd,'EUR','UK','US') eurusd_gbpusd = MR_one.merging_func() fig, ax1 = plt.subplots() x=eurusd_gbpusd['Date'] y1=eurusd_gbpusd['Price_x'] y2=eurusd_gbpusd['Price_y'] ax2 = ax1.twinx() ax1.plot(x, y1, 'g-') ax1.plot(x, y2, 'b-') eurusd_gbpusd.Price_x = pd.to_numeric(eurusd_gbpusd.Price_x) eurusd_gbpusd.Price_y = pd.to_numeric(eurusd_gbpusd.Price_y) ts.adfuller(eurusd_gbpusd['Price_y']) result = ts.coint(eurusd_gbpusd['Price_x'],eurusd_gbpusd['Price_y']) eurusd_gbpusd_test = eurusd_gbpusd[['Price_x','Price_y']] coint_johansen(eurusd_gbpusd_test, -1, 1) ### Analyzing spreads and ratios ### eurusd_gbpusd['Price_diff'] = eurusd_gbpusd.Price_x - eurusd_gbpusd.Price_y eurusd_gbpusd['Log_ret'] = np.log(eurusd_gbpusd.Price_x/eurusd_gbpusd.Price_y) eurusd_gbpusd['ret'] = eurusd_gbpusd.Price_x/eurusd_gbpusd.Price_y eurusd_gbpusd.plot('Date','ret') plt.show() ########################################
def coint_similar(symbol, sc=slice(0, 2), show=True):
    """
    TODO: once this is stable, assign the slice to a variable so that sc = the corresponding variable directly
    """
    pd_list = get_pdlist(sc)
    sum_rank = get_sum_rank(pd_list, symbol)
    rank_head = sum_rank.sort_values(ascending=True)[1:100]
    kl_pd = SymbolPd.make_kfold_pd(symbol, n_folds=1)
    mul_pd = SymbolPd.make_kfold_mulpd(rank_head.index.tolist(), n_folds=1)
    coint_dict = {}
    for ind, cmp_symbol in enumerate(rank_head.index):
        klpd_cmp = mul_pd[cmp_symbol]
        if klpd_cmp is None:
            continue
        _, pvalue, _ = coint(kl_pd.close, klpd_cmp.close)
        if pvalue < 0.08:
            """
            Record the index to find out how many of sort_values(ascending=True)[1:100]
            yield good data
            """
            coint_dict[cmp_symbol] = (pvalue, ind)
    p_value_sorted = sorted(zip(coint_dict.values(), coint_dict.keys()))
    cmp_cnt = np.minimum(len(p_value_sorted), 10)
    symbols = [item[1] for item in p_value_sorted[:cmp_cnt]]
    mul_pd_it = mul_pd.swapaxes('items', 'minor')
    sd = mul_pd_it.items.tolist()
    sd.remove('close')
    """
    To get a clean 'close' column out of the 3-D panel
    """
    close_panel = mul_pd_it.drop(sd)
    close_panel_pd = close_panel.loc['close'][symbols]
    if show:
        close_panel_pd_regular = NpUtil.regular_std(close_panel_pd)
        close_panel_pd_regular.plot()
        plt.title('close panel pd regular')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
    close_panel_pd_cp = copy.deepcopy(close_panel_pd)
    for col in close_panel_pd_cp.columns:
        """
        Build an identically shaped DataFrame just to get a voting-weight table that is
        easy to compute with
        """
        close_panel_pd_cp[col] = kl_pd.close
    regular_diff = NpUtil.regular_std(close_panel_pd_cp - close_panel_pd)
    if show:
        regular_diff.plot()
        plt.title('regular diff')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
    """
    Voting-like mechanism: get the summed coint difference, distance_votes
    2015-07-27   -14.724491
    2015-07-28   -12.712066
    2015-07-29   -11.945266
    2015-07-30   -13.801350
    2015-07-31   -13.520431
    2015-08-03   -11.381343
    2015-08-04    -9.486645
    2015-08-05   -11.319338
    2015-08-06    -6.517725
    2015-08-07    -9.103014
    2015-08-10    -5.025694
    ......................
    """
    distance_votes = regular_diff.sum(axis=1)
    votes_std = distance_votes.std()
    votes_mean = distance_votes.mean()
    above = votes_mean + votes_std
    below = votes_mean - votes_std
    if show:
        close_regular = NpUtil.regular_std(kl_pd.close)
        close_regular = (close_regular * distance_votes.max() / 2)
        close_regular.plot()
        distance_votes.plot()
        plt.axhline(votes_mean, color='r')
        plt.axhline(above, color='c')
        plt.axhline(below, color='g')
        plt.title('coint distance votes')
        plt.legend(['close regular', 'distance votes', 'votes mean', 'dvotes above', 'dvotes below'],
                   bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
def __init__(self):
    self.coint_t = coint(self.y1, self.y2, regression="c")[0]
    self.teststat = -1.8208817
import pandas as pd
import statsmodels.tsa.stattools as ts

df = pd.read_excel('data.xlsx', index_col='Dates')
df['GTII10'] = df['GTII10'] * -1
date_1, date_2 = '2016-01-01', '2017-01-01'
df_filtered = df.loc[date_1:date_2]
print(ts.coint(df_filtered['GTII10'], df_filtered['GOLDS'])[1])
                   parse_dates='Date', index_col='Date')
X = data['ADANIPOWER']
Y = data['JSWENERGY']
Y.tail(5)

X = X['2014-01-01':'2014-12-31']
Y = Y['2014-01-01':'2014-12-31']
pd.concat([X, Y], axis=1).plot()

(Y / X).plot()
plt.axhline((Y / X).mean(), color='red', linestyle='--')

score, pvalue, _ = coint(X, Y)
print(pvalue)

def zscore(series):
    return (series - series.mean()) / np.std(series)

ratio_series = Y / X
zscore(ratio_series).plot()
plt.axhline(zscore(ratio_series).mean(), color='black')
plt.axhline(1.0, color='red', linestyle='--')
plt.axhline(-1.0, color='green', linestyle='--')

ratio = Y / X
ratio.name = 'ratio'
def setup_class(cls):
    #cls.coint_t = coint(cls.y1, cls.y2, trend="c")[0]
    cls.coint_t = coint(cls.y1, cls.y2, trend="c", maxlag=0, autolag=None)[0]
    cls.teststat = -1.8208817
    cls.teststat = -1.830170986148
def checkPortfolioForCoint(self, critValue=0.01, fromDate="2015-01-01", toDate="2020-09-21", calcMean=False, usead=0): """ This is one of the most useful function in this class :param critValue: The critical value of which to judge things are cointegrated, equivalent to the p value cut off :param fromDate: The date to start the check of cointegration test from :param toDate: The date to end the check of cointegration test :param calcMean: Sometimes it is handy to return the mean of the cointegrated data, this gives a boolean for doing that :param usead: I tested other cointegration methods and this allows you to adjust which one to use :return: The dataframe of (symbol1, symbol2, pvalue), so the 2 cointegrated stocks and the certainty of which they are cointegrated """ start_time = time.time() num_stocks = len(self.portfolio) keys = list(self.portfolio.keys()) df = pd.DataFrame(columns=('symbol1', 'symbol2', 'pvalue')) for i in range(num_stocks): for j in range(i + 1, num_stocks): data1 = self.portfolio[keys[i]][ self.analysisOn][fromDate:toDate] data2 = self.portfolio[keys[j]][ self.analysisOn][fromDate:toDate] model = sm.OLS(data1, sm.add_constant(data2)) results = model.fit() spread = data1 - results.params[1] * data2 - results.params[0] if results.params[1] < 0: continue try: if not usead: result = coint(data1, data2) else: result = adfuller(spread) # else: # df = pd.DataFrame({'data1': data1, 'data2': data2}) # result = coint_johansen(df, 0, 1) # print("Critical values(90%, 95%, 99%) of max_eig_stat\n", result.cvm, '\n') # print("Critical values(90%, 95%, 99%) of trace_stat\n", result.cvt, '\n') except: print("Cannot calculate coint for {}, {}".format( keys[i], keys[j])) continue pvalue = result[1] if pvalue < critValue: mean = np.mean(spread) std = np.std(spread) df = df.append( { 'symbol1': keys[i], 'symbol2': keys[j], 'pvalue': pvalue, 'mean': mean, 'std': std }, ignore_index=True) # At some point add sm.OLS HERE df = df.sort_values(by='pvalue', ignore_index=True) self.cointStocks = df print("checkPortfolioForCoint took {} seconds".format(time.time() - start_time)) if calcMean == False: return df
def coint_similar(symbol, sum_rank=None, corr_jobs=(ECoreCorrType.E_CORE_TYPE_PEARS, ECoreCorrType.E_CORE_TYPE_SPERM), show=True): """ 首先找到的是最相关的top个,从top n个最相关的再找协整,只考虑pvalue,因为已经是从top n个最相关的再找协整 可视化整个过程 :param symbol: eg: 'usTSLA' :param sum_rank: 已经缓存了的sum_rank数据, eg: sum_rank usBIDU 10.0 usFB 16.0 usGOOG 12.0 usNOAH 2.0 usSFUN 14.0 usTSLA 18.0 usVIPS 6.0 usWUBA 8.0 :param corr_jobs: 默认:corr_jobs=(ECoreCorrType.E_CORE_TYPE_PEARS, ECoreCorrType.E_CORE_TYPE_SPERM) 可以再添加更多jobs eg: corr_jobs=(ECoreCorrType.E_CORE_TYPE_PEARS, ECoreCorrType.E_CORE_TYPE_SPERM, ECoreCorrType.E_CORE_TYPE_SIGN, ECoreCorrType.E_CORE_TYPE_ROLLING) 注意每添加一种相关计算方法,耗时都会增加 :param show: 是否进行可视化 """ cs = code_to_symbol(symbol) symbol = cs.value if sum_rank is None: tmp_market = ABuEnv.g_market_target # 强制把市场设置为一样的 ABuEnv.g_market_target = cs.market corr_df_dict = ABuSimilar.multi_corr_df(corr_jobs) # 恢复之前的市场 ABuEnv.g_market_target = tmp_market """ eg: corr_df_dict {'pears': usBIDU usFB usGOOG usNOAH usSFUN usTSLA usVIPS usWUBA usBIDU 1.0000 0.3013 0.3690 0.4015 0.3680 0.3015 0.3706 0.4320 usFB 0.3013 1.0000 0.6609 0.2746 0.1978 0.4080 0.2856 0.2438 usGOOG 0.3690 0.6609 1.0000 0.3682 0.1821 0.3477 0.3040 0.2917 usNOAH 0.4015 0.2746 0.3682 1.0000 0.3628 0.2178 0.4645 0.4488 usSFUN 0.3680 0.1978 0.1821 0.3628 1.0000 0.2513 0.2843 0.4883 usTSLA 0.3015 0.4080 0.3477 0.2178 0.2513 1.0000 0.2327 0.3340 usVIPS 0.3706 0.2856 0.3040 0.4645 0.2843 0.2327 1.0000 0.4189 usWUBA 0.4320 0.2438 0.2917 0.4488 0.4883 0.3340 0.4189 1.0000 'sperm': usBIDU usFB usGOOG usNOAH usSFUN usTSLA usVIPS usWUBA usBIDU 1.0000 0.3888 0.4549 0.4184 0.3747 0.3623 0.4333 0.4396 usFB 0.3888 1.0000 0.7013 0.2927 0.2379 0.4200 0.3123 0.2216 usGOOG 0.4549 0.7013 1.0000 0.3797 0.2413 0.3871 0.3922 0.3035 usNOAH 0.4184 0.2927 0.3797 1.0000 0.3581 0.2066 0.4643 0.4382 usSFUN 0.3747 0.2379 0.2413 0.3581 1.0000 0.2645 0.3890 0.4693 usTSLA 0.3623 0.4200 0.3871 0.2066 0.2645 1.0000 0.2540 0.2801 usVIPS 0.4333 0.3123 0.3922 0.4643 0.3890 0.2540 1.0000 0.4080 usWUBA 0.4396 0.2216 0.3035 0.4382 0.4693 0.2801 0.4080 1.0000 } """ sum_rank = rank_corr_sum(corr_df_dict, symbol) """ eg: sum_rank usBIDU 10.0 usFB 16.0 usGOOG 12.0 usNOAH 2.0 usSFUN 14.0 usTSLA 18.0 usVIPS 6.0 usWUBA 8.0 """ if sum_rank is None: logging.info('{} not in corr df!!!'.format(symbol)) return None, None top_cnt = sum_rank.shape[0] if g_top_corr_cnt > sum_rank.shape[0] else g_top_corr_cnt # 首先找到的是最相关的top个 rank_head = sum_rank.sort_values(ascending=True)[1:top_cnt] # 使用symbol做标尺 benchmark = AbuBenchmark(symbol, n_folds=1) # benchmark做为数据标尺获取最相关的top个金融时间数据 mul_pd = ABuSymbolPd.make_kl_df(rank_head.index, n_folds=1, data_mode=EMarketDataSplitMode.E_DATA_SPLIT_UNDO, benchmark=benchmark) coint_dict = {} for ind, cmp_symbol in enumerate(rank_head.index): if cmp_symbol not in mul_pd: continue klpd_cmp = mul_pd[cmp_symbol] if klpd_cmp is None: continue """ coint返回值三个如下: coint_t : float t-statistic of unit-root test on residuals pvalue : float MacKinnon's approximate p-value based on MacKinnon (1994) crit_value : dict Critical values for the test statistic at the 1 %, 5 %, and 10 % levels. 
这里只考虑pvalue,因为已经是从top n个最相关的再找协整 """ _, pvalue, _ = coint(benchmark.kl_pd.close, klpd_cmp.close) if pvalue < g_coint_threshold: # pvalue小于阀值即加入coint_dict字典 # 记录ind为了发现取多少个sort_values(ascending=True)[1:g_top_corr_cnt]能有良好的数据 # 即为了事后调整g_top_corr_cnt使用,并非实际需要 coint_dict[cmp_symbol] = (pvalue, ind) p_value_sorted = sorted(zip(coint_dict.values(), coint_dict.keys())) if len(p_value_sorted) == 0: logging.info( 'len(p_value_sorted) == 0 please try change tl.similar.g_top_corr_cnt|tl.similar.g_coint_threshold!') return None, None if show: cmp_cnt = np.minimum(len(p_value_sorted), g_coint_show_max) # 只取item[1],[0]是ind symbols = [item[1] for item in p_value_sorted[:cmp_cnt]] mul_pd_swap = mul_pd.swapaxes('items', 'minor') close_panel_pd = mul_pd_swap['close'][symbols] """ 转轴后只取收盘价格序列 eg: close_panel_pd usFB usGOOG usNOAH usVIPS usWUBA us_NYSE:.IXIC 2015-07-24 96.95 623.56 23.40 20.250 65.25 5088.629 2015-07-27 94.17 627.26 22.16 19.990 62.89 5039.776 2015-07-28 95.29 628.00 22.94 20.200 60.32 5089.207 2015-07-29 96.99 631.93 23.35 20.260 59.89 5111.730 2015-07-30 95.21 632.59 22.87 19.700 60.24 5128.785 ... ... ... ... ... ... ... 2016-07-20 121.92 741.19 25.11 13.630 48.17 5089.930 2016-07-21 120.61 738.63 25.51 13.690 49.25 5073.900 2016-07-22 121.00 742.74 25.50 13.510 49.21 5100.160 2016-07-25 121.63 739.77 25.57 13.390 49.84 5097.628 2016-07-26 121.64 740.92 24.75 13.655 50.36 5084.629 """ # 将数据scale到一个级别上,注意使用mean_how=True,避免极值的干扰 close_panel_pd = ABuScalerUtil.scaler_matrix(close_panel_pd, mean_how=True) """ ABuScalerUtil.scaler_matrix缩放后的数据矩阵如下所示 eg: close_panel_pd usFB usGOOG usNOAH usVIPS usWUBA 2015-07-24 4451.7674 4311.1198 4477.3494 6601.2284 5980.4246 2015-07-27 4324.1148 4336.7006 4240.0882 6516.4719 5764.1211 2015-07-28 4375.5432 4341.8168 4389.3332 6584.9290 5528.5703 2015-07-29 4453.6041 4368.9877 4467.7825 6604.4882 5489.1591 ... ... ... ... ... ... 2016-07-20 5598.3443 5124.3808 4804.5404 4443.1972 4414.9740 2016-07-21 5538.1915 5106.6817 4881.0762 4462.7564 4513.9603 2016-07-22 5556.0995 5135.0971 4879.1628 4404.0788 4510.2942 2016-07-25 5585.0280 5114.5633 4892.5566 4364.9604 4568.0362 2016-07-26 5585.4872 5122.5141 4735.6581 4451.3468 4615.6963 """ # 可视化scaler_matrix操作后的close close_panel_pd.plot(figsize=ABuEnv.g_plt_figsize) plt.title('close panel pd scaler_matrix') plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.show() close_panel_pd_cp = copy.deepcopy(close_panel_pd) for col in close_panel_pd_cp.columns: """ 做一个一摸一样的pd就是为了得到投票权重表便于运算: close_panel_pd_cp[col] = benchmark.kl_pd.close 将所有数据列都使用标尺的数据进行替换,结果是每一列的数据都相同, 比如这样,列数据都和标尺一样 usFB usGOOG usNOAH usVIPS usWUBA us_NYSE:.IXIC 2015-07-24 265.41 265.41 265.41 265.41 265.41 265.41 2015-07-27 253.01 253.01 253.01 253.01 253.01 253.01 2015-07-28 264.82 264.82 264.82 264.82 264.82 264.82 2015-07-29 263.82 263.82 263.82 263.82 263.82 263.82 2015-07-30 266.79 266.79 266.79 266.79 266.79 266.79 ... ... ... ... ... ... ... 
2016-07-20 228.36 228.36 228.36 228.36 228.36 228.36 2016-07-21 220.50 220.50 220.50 220.50 220.50 220.50 2016-07-22 222.27 222.27 222.27 222.27 222.27 222.27 2016-07-25 230.01 230.01 230.01 230.01 230.01 230.01 2016-07-26 225.93 225.93 225.93 225.93 225.93 225.93 """ close_panel_pd_cp[col] = benchmark.kl_pd.close """ 将复刻后的close_panel_pd_cp与原始close_panel_pd求差后,再进行scaler_std ABuScalerUtil.scaler_std(close_panel_pd_cp - close_panel_pd): usFB usGOOG usNOAH usVIPS usWUBA us_NYSE:.IXIC 2015-07-24 0.9705 1.7793 0.7405 -1.6987 -1.9294 -1.0803 2015-07-27 1.2277 1.6619 1.1473 -1.6270 -1.5697 -0.8853 2015-07-28 1.1393 1.6826 0.8987 -1.6831 -1.1334 -1.0866 2015-07-29 0.9629 1.5955 0.7550 -1.7035 -1.0656 -1.2124 2015-07-30 1.1519 1.5906 0.9265 -1.5197 -1.1169 -1.2878 ... ... ... ... ... ... ... 2016-07-21 -1.5539 -0.8188 -0.0710 0.3755 0.5784 -1.2418 2016-07-22 -1.5899 -0.9012 -0.0644 0.4354 0.5879 -1.3728 2016-07-25 -1.6371 -0.8138 -0.0746 0.4819 0.4997 -1.3179 2016-07-26 -1.6473 -0.8509 0.2018 0.3922 0.4085 -1.2702 """ regular_diff = ABuScalerUtil.scaler_std(close_panel_pd_cp - close_panel_pd) regular_diff.plot(figsize=ABuEnv.g_plt_figsize) plt.title('regular diff') plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) plt.show() """ distance_votes = regular_diff.sum(axis=1): 投票机制,获取投票coint的差值 distance_votes distance_votes 2015-07-24 -1.2181 2015-07-27 -0.0451 2015-07-28 -0.1825 2015-07-29 -0.6682 2015-07-30 -0.2555 ... 2016-07-20 -2.5541 2016-07-21 -2.7316 2016-07-22 -2.9049 2016-07-25 -2.8618 2016-07-26 -2.7658 ...................... """ distance_votes = regular_diff.sum(axis=1) votes_std = distance_votes.std() votes_mean = distance_votes.mean() above = votes_mean + votes_std below = votes_mean - votes_std close_regular = ABuScalerUtil.scaler_std(benchmark.kl_pd.close) close_regular = (close_regular * distance_votes.max() / 2) with plt_show(): # noinspection PyUnresolvedReferences close_regular.plot() distance_votes.plot() plt.axhline(votes_mean, color='r') plt.axhline(above, color='c') plt.axhline(below, color='g') plt.title('coint distance votes') plt.legend(['close regular', 'distance votes', 'votes mean', 'dvotes above', 'dvotes below'], bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.) return p_value_sorted, sum_rank
quotes = []
for idx, name in enumerate(names):
    quote = stock_data_by_name(conn, name)
    quote = quote[:115]
    series = pd.Series(quote['close'], dtype='float')
    s_dict = {'name': name, 'data': series}
    quotes.append(s_dict)
stock_cb = list(combinations(quotes, 2))
print("stock combinations length : {}".format(len(stock_cb)))
result = []
for stock_info in stock_cb:
    x = stock_info[0].get("data")
    x_name = stock_info[0].get("name")
    y = stock_info[1].get("data")
    y_name = stock_info[1].get("name")
    coin_result = ts.coint(x, y)
    adf = coin_result[0]
    p_value = coin_result[1]
    one_percent = coin_result[2][0]
    five_percent = coin_result[2][1]
    ten_percent = coin_result[2][2]
    # note: the first condition compares the (negative) test statistic with 0.01, not
    # the p-value; the critical-value comparisons do the real filtering here
    if adf < 0.01 and adf < one_percent and adf < five_percent and adf < ten_percent:
        print("result: {}, stock tuple:{} ".format(coin_result, (x_name, y_name)))
        result.append((x_name, y_name))
print(len(result))

if __name__ == '__main__':
    print("execute over")
def test_coint(): nobs = 200 scale_e = 1 const = [1, 0, 0.5, 0] np.random.seed(123) unit = np.random.randn(nobs).cumsum() y = scale_e * np.random.randn(nobs, 4) y[:, :2] += unit[:, None] y += const y = np.round(y, 4) for trend in []:#['c', 'ct', 'ctt', 'nc']: print('\n', trend) print(coint(y[:, 0], y[:, 1], trend=trend, maxlag=4, autolag=None)) print(coint(y[:, 0], y[:, 1:3], trend=trend, maxlag=4, autolag=None)) print(coint(y[:, 0], y[:, 2:], trend=trend, maxlag=4, autolag=None)) print(coint(y[:, 0], y[:, 1:], trend=trend, maxlag=4, autolag=None)) # results from Stata egranger res_egranger = {} # trend = 'ct' res = res_egranger['ct'] = {} res[0] = [-5.615251442239, -4.406102369132, -3.82866685109, -3.532082997903] res[1] = [-5.63591313706, -4.758609717199, -4.179130554708, -3.880909696863] res[2] = [-2.892029275027, -4.758609717199, -4.179130554708, -3.880909696863] res[3] = [-5.626932544079, -5.08363327039, -4.502469783057, -4.2031051091] # trend = 'c' res = res_egranger['c'] = {} # first critical value res[0][1] has a discrepancy starting at 4th decimal res[0] = [-5.760696844656, -3.952043522638, -3.367006313729, -3.065831247948] # manually adjusted to have higher precision as in other cases res[0][1] = -3.952321293401682 res[1] = [-5.781087068772, -4.367111915942, -3.783961136005, -3.483501524709] res[2] = [-2.477444137366, -4.367111915942, -3.783961136005, -3.483501524709] res[3] = [-5.778205811661, -4.735249216434, -4.152738973763, -3.852480848968] # trend = 'ctt' res = res_egranger['ctt'] = {} res[0] = [-5.644431269946, -4.796038299708, -4.221469431008, -3.926472577178] res[1] = [-5.665691609506, -5.111158174219, -4.53317278104, -4.23601008516] res[2] = [-3.161462374828, -5.111158174219, -4.53317278104, -4.23601008516] res[3] = [-5.657904558563, -5.406880189412, -4.826111619543, -4.527090164875] # The following for 'nc' are only regression test numbers # trend = 'nc' not allowed in egranger # trend = 'nc' res = res_egranger['nc'] = {} nan = np.nan # shortcut for table res[0] = [-3.7146175989071137, nan, nan, nan] res[1] = [-3.8199323012888384, nan, nan, nan] res[2] = [-1.6865000791270679, nan, nan, nan] res[3] = [-3.7991270451873675, nan, nan, nan] for trend in ['c', 'ct', 'ctt', 'nc']: res1 = {} res1[0] = coint(y[:, 0], y[:, 1], trend=trend, maxlag=4, autolag=None) res1[1] = coint(y[:, 0], y[:, 1:3], trend=trend, maxlag=4, autolag=None) res1[2] = coint(y[:, 0], y[:, 2:], trend=trend, maxlag=4, autolag=None) res1[3] = coint(y[:, 0], y[:, 1:], trend=trend, maxlag=4, autolag=None) for i in range(4): res = res_egranger[trend] assert_allclose(res1[i][0], res[i][0], rtol=1e-11) r2 = res[i][1:] r1 = res1[i][2] assert_allclose(r1, r2, rtol=0, atol=6e-7)
def rebalance_pairs(context, data): if get_open_orders(): return prices = data.history(context.futures_list, 'price', context.long_ma, '1d') for future_y, future_x in context.futures_pairs: Y = prices[future_y] X = prices[future_x] y_log = np.log(Y) x_log = np.log(X) pvalue = coint(y_log, x_log)[1] if pvalue > 0.10: log.info( '({} {}) no longer cointegrated, no new positions.'.format( future_y.root_symbol, future_x.root_symbol, ), ) continue regression = sp.stats.linregress( x_log[-context.long_ma:], y_log[-context.long_ma:], ) spreads = Y - (regression.slope * X) zscore = (np.mean(spreads[-context.short_ma:]) - np.mean(spreads)) / np.std(spreads, ddof=1) future_y_contract, future_x_contract = data.current( [future_y, future_x], 'contract', ) context.current_weights[future_y_contract] = context.long_term_weights[ future_y_contract.root_symbol] context.current_weights[future_x_contract] = context.long_term_weights[ future_x_contract.root_symbol] hedge_ratio = regression.slope if context.inShort[(future_y.root_symbol, future_x.root_symbol)] and zscore < 0.0: context.long_term_weights[future_y_contract.root_symbol] = 0 context.long_term_weights[future_x_contract.root_symbol] = 0 context.current_weights[ future_y_contract] = context.long_term_weights[ future_y_contract.root_symbol] context.current_weights[ future_x_contract] = context.long_term_weights[ future_x_contract.root_symbol] context.inLong[(future_y.root_symbol, future_x.root_symbol)] = False context.inShort[(future_y.root_symbol, future_x.root_symbol)] = False continue if context.inLong[(future_y.root_symbol, future_x.root_symbol)] and zscore > 0.0: context.long_term_weights[future_y_contract.root_symbol] = 0 context.long_term_weights[future_x_contract.root_symbol] = 0 context.current_weights[ future_y_contract] = context.long_term_weights[ future_y_contract.root_symbol] context.current_weights[ future_x_contract] = context.long_term_weights[ future_x_contract.root_symbol] context.inLong[(future_y.root_symbol, future_x.root_symbol)] = False context.inShort[(future_y.root_symbol, future_x.root_symbol)] = False continue if zscore < -1.0 and (not context.inLong[(future_y.root_symbol, future_x.root_symbol)]): # Only trade if NOT already in a trade y_target_contracts = 1 x_target_contracts = hedge_ratio context.inLong[(future_y.root_symbol, future_x.root_symbol)] = True context.inShort[(future_y.root_symbol, future_x.root_symbol)] = False (y_target_pct, x_target_pct) = computeHoldingsPct( y_target_contracts, x_target_contracts, future_y_contract.multiplier * Y[-1], future_x_contract.multiplier * X[-1]) context.long_term_weights[ future_y_contract.root_symbol] = y_target_pct context.long_term_weights[ future_x_contract.root_symbol] = -x_target_pct context.current_weights[ future_y_contract] = context.long_term_weights[ future_y_contract.root_symbol] context.current_weights[ future_x_contract] = context.long_term_weights[ future_x_contract.root_symbol] continue if zscore > 1.0 and (not context.inShort[(future_y.root_symbol, future_x.root_symbol)]): # Only trade if NOT already in a trade y_target_contracts = 1 x_target_contracts = hedge_ratio context.inLong[(future_y.root_symbol, future_x.root_symbol)] = False context.inShort[(future_y.root_symbol, future_x.root_symbol)] = True (y_target_pct, x_target_pct) = computeHoldingsPct( y_target_contracts, x_target_contracts, future_y_contract.multiplier * Y[-1], future_x_contract.multiplier * X[-1]) context.long_term_weights[ future_y_contract.root_symbol] = -y_target_pct context.long_term_weights[ 
future_x_contract.root_symbol] = x_target_pct context.current_weights[ future_y_contract] = context.long_term_weights[ future_y_contract.root_symbol] context.current_weights[ future_x_contract] = context.long_term_weights[ future_x_contract.root_symbol] continue adjusted_weights = pd.Series({ k: v / (len(context.futures_pairs)) for k, v in context.current_weights.items() }) order_optimal_portfolio( opt.TargetWeights(adjusted_weights), constraints=[ opt.MaxGrossExposure(1.0), ], ) log.info('weights: ', adjusted_weights)
puntuacion = ((prices[etfs[1]] - prices[etfs[0]]) - (prices[etfs[1]] - prices[etfs[0]]).mean()) / np.std((prices[etfs[1]] - prices[etfs[0]]))
puntuacion.plot()
plt.axhline(puntuacion.mean())
plt.axhline(1.0, color='red')
plt.axhline(-1.0, color='green')
plt.title('Zscore of the spread')
plt.show()

prices_train = prices.loc[start_train_date:start_date]
prices_train = prices_train.drop(prices_train.index[-1], axis=0)
prices_test = prices.loc[start_date:end_date]
visual_coint(prices_train[etfs[0]], prices_train[etfs[1]])
visual_coint(prices_test[etfs[0]], prices_test[etfs[1]])
result = ts.coint(prices_train[etfs[0]], prices_train[etfs[1]])  # run the cointegration test
print(result)
pvalue = result[1]  # get the p-value
print(pvalue)

ganado = 0
for exito in exitos:
    ganado += (exito[2] - 1)
print('Money won: ', 100000 * ganado)
perdido = 0
for pifia in fracasos:
    perdido += (pifia[2] - 1)
print('Money lost: ', 100000 * perdido)
print('Profit: ', 100000 * ganado + 100000 * perdido)
def __init__(self):
    #self.coint_t = coint(self.y1, self.y2, trend="c")[0]
    self.coint_t = coint(self.y1, self.y2, trend="c", maxlag=0, autolag=None)[0]
    self.teststat = -1.8208817
    self.teststat = -1.830170986148
# In[305]:

(Y-X).plot()  # Plot the spread
plt.axhline((Y-X).mean(), color='red', linestyle='--')  # Add the mean

# ## Testing for Cointegration
#
# That's an intuitive definition, but how do we test for this statistically? There is a
# convenient test that lives in `statsmodels.tsa.stattools`. We should see a very low
# p-value, as we've artificially created two series that are as cointegrated as
# physically possible.

# In[306]:

# compute the p-value of the cointegration test
# will inform us as to whether the spread btwn the 2 timeseries is stationary
# around its mean
score, pvalue, _ = coint(X, Y)
print(pvalue)

# ### Correlation vs. Cointegration
#
# Correlation and cointegration, while theoretically similar, are not the same. To
# demonstrate this, we'll show examples of series that are correlated, but not
# cointegrated, and vice versa. To start let's check the correlation of the series we
# just generated.

# In[307]:

X.corr(Y)

# That's very high, as we would expect. But how would two series that are correlated
# but not cointegrated look?
#
# ### Correlation Without Cointegration
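# A minimal sketch of the "correlation without cointegration" case the heading above
# introduces (an illustrative continuation, not the original notebook cell): two random
# walks with different drifts move together, so correlation is high, but their spread
# wanders, so the cointegration p-value stays large.
X_ret = np.random.normal(1, 1, 100)
Y_ret = np.random.normal(2, 1, 100)
X_div = pd.Series(np.cumsum(X_ret), name='X_diverging')
Y_div = pd.Series(np.cumsum(Y_ret), name='Y_diverging')
print('Correlation: %.5f' % X_div.corr(Y_div))
print('Cointegration p-value: %.5f' % coint(X_div, Y_div)[1])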