Example 1
def gen_randdata(n=2):
  # Generate random data for correlation and cointegration tests.
  # First, cointegrated and correlated
  X_returns = np.random.normal(0, 1, 100) # Generate the daily returns
  # sum them and shift all the prices up into a reasonable range
  X = pd.Series(np.cumsum(X_returns), name='X') + 50
  some_noise = np.random.normal(0, 1, 100)
  Y = X + 5 + some_noise
  Y.name = 'Y'
  fig = plt.figure()
  ax1 = fig.add_subplot(321)
  ax1.plot(pd.concat([X, Y], axis=1))
  ax2 = fig.add_subplot(322)
  ax2.plot(Y-X)
  ax2.axhline((Y-X).mean(), color='red', linestyle='--')
  score, pvalue, _ = coint(X,Y)
  ax1.set_title('Correlation: %.5f' %X.corr(Y))
  ax2.set_title('Cointegration P: %.5f' %pvalue)
  # Next, correlated but not cointegrated
  X_returns = np.random.normal(1, 1, 100)
  Y_returns = np.random.normal(2, 1, 100)
  X_diverging = pd.Series(np.cumsum(X_returns), name='X_diverging')
  Y_diverging = pd.Series(np.cumsum(Y_returns), name='Y_diverging')
  ax3 = fig.add_subplot(323)
  ax4 = fig.add_subplot(324)
  ax3.plot(pd.concat([X_diverging, Y_diverging], axis=1))
  ax4.plot(Y_diverging-X_diverging)
  ax4.axhline((Y_diverging-X_diverging).mean(), color='red', linestyle='--')
  score, pvalue, _ = coint(X_diverging, Y_diverging)
  ax3.set_title('Correlation: %.5f' %X_diverging.corr(Y_diverging))
  ax4.set_title('Cointegration P: %.5f' %pvalue)
  # cointegration without correlation ("nominal convergence")
  X_coint = pd.Series(np.random.normal(0, 1, 1000), name='X_coint') + 20
  Y_coint = X_coint.copy()
  for i in range(10):
    if i % 2 == 0:
      Y_coint[i*100:(i+1)*100] = 10
    else:
      Y_coint[i*100:(i+1)*100] = 30
  ax5 = fig.add_subplot(325)
  ax5.plot(X_coint)
  ax5.plot(Y_coint)
  ax5.set_ylim([0,40])
  ax6 = fig.add_subplot(326)
  ax6.plot(Y_coint-X_coint)
  ax6.axhline((Y_coint-X_coint).mean(), color='red', linestyle='--')
  score, pvalue, _ = coint(X_coint, Y_coint)
  ax5.set_title('Correlation: %.5f' %X_coint.corr(Y_coint))
  ax6.set_title('Cointegration P: %.5f' %pvalue)
  plt.show()
  return
Example 2
def simple_comparison(X,Y, show=True):
  """
  Compares correlation and cointegration for X and Y. Can be Y! dataframe
  or n x 1 arrays.
  """
  if type(X) is pd.DataFrame:
    x = X['Adj Close']
  else:
    x = X
  if type(Y) is pd.DataFrame:
    y = Y['Adj Close']
  else:
    y = Y
  #x = pd.Series(np.cumsum(x), name='x')
  #y = pd.Series(np.cumsum(y), name='y')
  x = pd.Series(x, name='x')
  y = pd.Series(y, name='y')
  x, y = return_timelocked(x,y)
  fig = plt.figure()
  ax1 = fig.add_subplot(121)
  ax1.plot(pd.concat([x,y], axis=1))
  ax2 = fig.add_subplot(122)
  ax2.plot(y-x)
  ax2.axhline((y-x).mean(), color='red', linestyle='--')
  score, pvalue, _ = coint(x,y)
  ax1.set_title('Correlation: %.5f' %x.corr(y))
  ax2.set_title('Cointegration P: %.5f' %pvalue)
  if show:
    plt.show()
    return
  return ax1, ax2
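The helper return_timelocked is not shown in this snippet; a plausible stand-in, assuming it simply aligns the two series on their shared dates:

def return_timelocked(x, y):
    # align the two series on their common dates and drop missing rows
    aligned = pd.concat([x, y], axis=1).dropna()
    return aligned.iloc[:, 0], aligned.iloc[:, 1]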
Example 3
def test_coint(pair):
    # Using Augmented Dickey-Fuller unit root test (from Brett's code)
    #result = sm.OLS(pair[1], pair[0]).fit()
    #dfResult =  ts.adfuller(result.resid)
    #return dfResult[0] >= dfResult[4]['10%']

    # Using cointegration tests built into statsmodels
    result = ts.coint(pair[1], pair[0])
    return result[0] >= result[2][2]
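The commented-out branch above is the Engle-Granger two-step procedure done by hand. A minimal sketch of that approach, assuming pair holds two aligned price series (note this version adds a constant to the regression, which the commented code omitted):

import statsmodels.api as sm
import statsmodels.tsa.stattools as ts

def engle_granger_by_hand(pair):
    # Step 1: OLS of one series on the other (with a constant) to get residuals
    result = sm.OLS(pair[1], sm.add_constant(pair[0])).fit()
    # Step 2: ADF unit-root test on the residuals; stationary residuals
    # are evidence that the pair is cointegrated
    adf_stat, pvalue = ts.adfuller(result.resid)[:2]
    return pvalue < 0.05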
Example 4
def test_coint_identical_series():
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    y = scale_e * np.random.randn(nobs)
    warnings.simplefilter('always', CollinearityWarning)
    with pytest.warns(CollinearityWarning):
        c = coint(y, y, trend="c", maxlag=0, autolag=None)
    assert_equal(c[1], 0.0)
    assert_(np.isneginf(c[0]))
Example 5
def test_coint_identical_series():
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    y = scale_e * np.random.randn(nobs)
    warnings.simplefilter('always', ColinearityWarning)
    with warnings.catch_warnings(record=True) as w:
        c = coint(y, y, trend="c", maxlag=0, autolag=None)
    assert_equal(len(w), 1)
    assert_equal(c[1], 0.0)
    assert_(np.isneginf(c[0]))
Example 6
def test_coint_perfect_collinearity():
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    x = scale_e * np.random.randn(nobs, 2)
    y = 1 + x.sum(axis=1)
    warnings.simplefilter('always', ColinearityWarning)
    with warnings.catch_warnings(record=True) as w:
        c = coint(y, x, trend="c", maxlag=0, autolag=None)
    assert_equal(c[0], 0.0)
    # Limit of table
    assert_(c[1] > .98)
Example 7
def test_coint_perfect_collinearity():
    # test uses nearly perfect collinearity
    nobs = 200
    scale_e = 1
    np.random.seed(123)
    x = scale_e * np.random.randn(nobs, 2)
    y = 1 + x.sum(axis=1) + 1e-7 * np.random.randn(nobs)
    warnings.simplefilter('always', CollinearityWarning)
    with warnings.catch_warnings(record=True) as w:
        c = coint(y, x, trend="c", maxlag=0, autolag=None)
    assert_equal(c[1], 0.0)
    assert_(np.isneginf(c[0]))
Example 8
def get_pairs(train):
    s = train.corr().abs().unstack().drop_duplicates()
    corr = s.sort_values()
    corr.to_csv('~/Desktop/MAFN/Hedge Fund/presentation2/correlation.csv')
    pairs = list(corr.index)[-200:-1]
    ans = []
    for pair in pairs:
        _, p_value, _ = coint(train[pair[0]], train[pair[1]])
        if p_value < 0.05:
            ans.append(pair)
    selected = corr[ans]
    selected.to_csv('highcorrelationandcointegrated.csv', header = None)
    return list(selected.index)
Example 9
def find_cointegrated_pairs(securities_panel):
    # NOTE: relies on the long-removed pd.Panel API (minor_axis, minor_xs)
    n = len(securities_panel.minor_axis)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = securities_panel.keys
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            S1 = securities_panel.minor_xs(securities_panel.minor_axis[i])
            S2 = securities_panel.minor_xs(securities_panel.minor_axis[j])
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.05:
                pairs.append((securities_panel.minor_axis[i], securities_panel.minor_axis[j]))
    return score_matrix, pvalue_matrix, pairs
Example 10
def find_cointegrated_pairs(stock_data, significance=0.05):
    # This function is from
    # https://www.quantopian.com/lectures/introduction-to-pairs-trading.
    # Pairs below the significance threshold are returned with their p-values.
    n = stock_data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = stock_data.keys()
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            S1 = stock_data[keys[i]]
            S2 = stock_data[keys[j]]
            result = stattools.coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < significance:
                pairs.append({(keys[i], keys[j]): pvalue})
    return score_matrix, pvalue_matrix, pairs
Example 11
    def calibrate(self, prices):

        # identify valid pairs
        possible_pairs = self.get_possible_pairs(prices)
        valid, betas = [], []
        for pair in possible_pairs:
            
            # Engle-Granger Test
            log_1, log_2 = np.log(prices[list(pair)])._series.values()
            if ts.coint(log_1, log_2)[1] > 0.05: continue

            # Augmented Dickey-Fuller Test
            alpha, beta = cointegrate(log_1, log_2)
            spread = log_1 - beta * log_2 - alpha
            if ts.adfuller(spread)[1] > 0.05: continue

            # Mean Reversion Test
            k = get_reversion_speed(spread)
            if k < 0.2: continue
            valid.append(pair)
        
        # remove invalid live pairs
        invalid = set(self.pairs.keys()) - set(valid)
        orders = []
        for pair in invalid:
            positions = np.array(self.pairs[pair]['positions'])
            if np.absolute(positions).sum() > 0: orders += list(zip(pair, positions * -1))
            del self.pairs[pair]

        # add new valid pairs
        for pair in valid:
            if pair not in self.pairs:
                training_data = np.log(prices[list(pair)].values[:-1])
                self.pairs[pair] = {
                    'positions': [0, 0],
                    'state': 'neutral',
                    'filter': Kalman_Filter(training_data)
                }
        return orders
Example 12
def find_all_pairs(days: int, data_dir: str='StockData', corr_value=.9, p_value=.05, candle_type: str='close'):

	data_dir = Path.joinpath(Path.cwd(), data_dir)
	data_file_names = ticker_list(data_dir)
	data_file_names = data_file_names[:45]  # to not test 124000 possibilities in ~500 stock list
	cointegrated = []

	for i in range(len(data_file_names)):
		for j in range(i + 1, len(data_file_names)):
			data_list_1 = ranged_price_list(data_file_names[i], days, candle_type)
			data_list_2 = ranged_price_list(data_file_names[j], days, candle_type)

			# correlation test
			if corr_value < pearson_coor(data_list_1, data_list_2):
				coint_value = coint(data_list_1, data_list_2)[1]

				# cointegrated test
				if coint_value < p_value:
					cointegrated.append(
						(coint_value, data_file_names[i].name, data_file_names[j].name))

	return cointegrated
Example 13
def EG_coin_result(a, b):
    X = table1.loc[a][:2016]
    Y = table1.loc[b][:2016]
    #the data is from 2005 to 2015, 2517 days, there are 252 trading days in a year,
    #define the first 8 years as training period, 2016 days
    #start = datetime.datetime(2005, 1, 1)
    #end = datetime.datetime(2009, 12, 31)
    #mask = (data['date2'] > start) & (data['date2'] <= end)
    #X=X.loc[mask]
    #Y=Y.loc[mask]
    #result = pd.merge(X, Y, on='date2')
    #x1=result['prc_x'] #closing price
    #y1=result['prc_y'] #closing price
    '''
    #calculate price with 100
    newprc_x=pd.Series()
    newprc_x.set_value(result['date2'][0],100)
    for i in range(1,x1.size):
        newprc_x.set_value(result['date2'][i],newprc_x[i-1]*(result['ret_x'][i-1]+1))
    newprc_y=pd.Series(index=result['date2'])
    newprc_y.set_value(result['date2'][0],100)
    for i in range(1,y1.size):
        newprc_y.set_value(result['date2'][i],newprc_y[i-1]*(result['ret_y'][i-1]+1))
    '''
    #print ("Test 1: Engle Granger")
    try:
        coin_result = ts.coint(X, Y)
        #print("p-value: ",coin_result[1])
        #print('pair: ',a,'&',b)
        if 0 < coin_result[1] < 0.05:
            #print("passed")
            passed = True
            #print("p value:",coin_result[1])
        else:
            #print("not passed")
            passed = False
    except Exception:
        passed = False
    return passed
Example 14
def perform_coint(start_pointer, end_pointer):
    stock_series = get_data(start_pointer, end_pointer)
    p_values = []
    coint_sec = []

    tickers = [
        'XOM', 'RDS.A', 'CVX', 'TOT', 'BP', 'PTR', 'SNP', 'SLB', 'EPD', 'E',
        'COP', 'EQNR', 'EOG', 'PBR', 'CEO', 'SU', 'OXY', 'HAL'
    ]
    for i in range(len(tickers)):
        p_values.append([0] * len(tickers))
        coint_sec.append([0] * len(tickers))

    for i in range(0, len(tickers)):
        for j in range(0, len(tickers)):
            if (i < j):
                p_values[i][j] = (ts.coint(stock_series[tickers[i]],
                                           stock_series[tickers[j]]))[1]
            else:
                p_values[i][j] = 0.5

    threshold = 0.005

    pair_header = ['Stock 1', 'Stock 2']
    pairs = []
    pairs.append(pair_header)

    for i in range(0, len(tickers)):
        for j in range(0, len(tickers)):
            if (i < j):
                if (p_values[i][j] < threshold):
                    coint_sec[i][j] = 1
                    pairs.append([tickers[i], tickers[j]])
                else:
                    coint_sec[i][j] = 0
            else:
                p_values[i][j] = 0
    return tickers, pairs, p_values, coint_sec
Example 15
def Analyze_Data( data):
       
    # check for cointegration
    score, pvalue, _ = coint(data.x1, data.x2) 
        
    # test for stationary
    stationary, station_pvalue = check_for_stationarity(data.Z)
    
    data.stationary = stationary
    data.coi_pvalue = pvalue
    data.stn_pvalue = station_pvalue
        
    num = '{:2.3f}'

    if pvalue < 0.05:
#    if pvalue < 0.05 and stationary and data.b > 0 and data.b < 3:
#    if pvalue < 0.05 and data.b > 0 and stationary:
        
        data.trade_signal = True
        data.x1_symbol = symbols[1][data.i1]
        data.x2_symbol = symbols[1][data.i2]
        
        if zscore(data.Z).iloc[-1] > 0:
            data.x1_signal = trade_type.BUY
            data.x2_signal = trade_type.SELL
        elif zscore(data.Z).iloc[-1] < 0:
            data.x1_signal = trade_type.SELL
            data.x2_signal = trade_type.BUY
            
        text1 = 'Cointegration between ' + data.x1_symbol + ' and ' + data.x2_symbol + ' with p-value = ' + num.format(data.coi_pvalue)
        text2 = 'Beta (b) is '+ num.format(data.b)
        text3 = 'Spread is stationary with pvalue '+ num.format(data.stn_pvalue)
        text4 = 'spread max = '+ num.format(zscore(data.Z).max())
        text5 = 'spread min = '+ num.format(zscore(data.Z).min())
        text6 = 'current spread value ='+ num.format(zscore(data.Z).iloc[-1])            
        print(text1,'\n',text2,'\n',text3,'\n',text4,'\n',text5,'\n',text6)
                        
    return data
Example 16
def get_conint_params_2(ts1, ts2, window=30):

    if len(ts1) != len(ts2):
        return False
    else:
        df = pd.DataFrame([])

    df['ts_1'] = ts1
    df['ts_2'] = ts2
    # Calculate optimal hedge ratio "beta"
    df = df.fillna(method='ffill').dropna()
    df['ratio'] = [y / x for x, y in zip(ts1, ts2)]

    score, pvalue, _ = coint(ts1, ts2)
    # compute the rolling z-score of the ratio, smoothed with a 5-period median
    res_norm = zscore(df['ratio'], window).rolling(5).median()
    print("P: " + str(pvalue))
    #if pvalue < 0.3:#cadf[1] <= 0.1 or
    plt.plot(res_norm)
    plt.show()
    plt.plot(df['ratio'])
    plt.show()
    return pvalue
Example 17
    def perform_cointegration(self, company1: str,
                              company2: str) -> CointegrationResult:
        """
        Perform cointegration

        :param company1: first company in the list of companies
        :type company1: str
        :param company2: second company in the list of companies
        :type company2: str
        :return: CointegrationResult holding the t-statistic of the unit-root
            test on residuals, the p-value, and the critical values for the
            test statistic at the 1%, 5%, and 10% levels
        :rtype: CointegrationResult
        """
        if company1 not in self.companies or company2 not in self.companies:
            raise ValueError(
                'Input companies: {}, {} must be in the list of companies used to construct '
                'with: {}'.format(company1, company2, self.companies))
        coint_t, pvalue, crit = coint(self.closing_prices[company1],
                                      self.closing_prices[company2])
        return CointegrationResult(coint_t=coint_t, pvalue=pvalue, crit=crit)
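CointegrationResult is not defined in the snippet; a minimal stand-in consistent with how it is constructed above could be a NamedTuple:

from typing import NamedTuple
import numpy as np

class CointegrationResult(NamedTuple):
    coint_t: float     # t-statistic of the unit-root test on residuals
    pvalue: float      # MacKinnon's approximate p-value
    crit: np.ndarray   # critical values at the 1%, 5%, and 10% levels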
Example 18
def plotpairs(clA, clB):
    clA, clB = loaddata()
    # plot the two series
    plt.figure(1)
    plt.plot(clA, label='EWA', color='b')
    plt.plot(clB, label='EWC', color='g')
    plt.legend()

    plt.figure(2)
    plt.scatter(clA, clB)
    ## linear regression
    clA = sm.add_constant(clA)
    regression_result = sm.OLS(clB, clA).fit()
    hedgeRatio = regression_result.params[1]
    ## determine the hedge ratio
    # first restore clA (drop the constant column)
    clA = clA[:, 1]
    clC = clB - hedgeRatio * clA
    plt.figure(3)
    plt.plot(clC)
    ## test for cointegration
    results = coint(clA, clB)
    """
Example 19
def find_cointegrated_pairs(df_array): # Takes in an array of dataframes, outputs cointegrated pairs
    pairs = []
    for df in df_array:
        n = df.shape[1]
        score_matrix = np.zeros((n, n))
        pvalue_matrix = np.ones((n, n))
        keys = df.keys()
        total_tests = n * (n - 1) / 2 # n choose 2 total combinations
        for i in range(n):
            for j in range(i+1, n):
                S1 = df[keys[i]]
                S2 = df[keys[j]]
                result = ts.coint(S1, S2)
                score = result[0]
                pvalue = result[1]
                score_matrix[i, j] = score
                pvalue_matrix[i, j] = pvalue
                if pvalue < 0.05 / total_tests: # Applying Bonferroni correction across all tests
                    pairs.append((keys[i], keys[j]))
    return pairs
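For a sense of scale, an illustrative calculation of the corrected threshold (the numbers are purely hypothetical):

n = 100                          # stocks in one dataframe
total_tests = n * (n - 1) / 2    # 4950 pairwise tests
alpha = 0.05 / total_tests       # per-test threshold, roughly 1.0e-05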
Example 20
def find_cointegrated_pairs(securities_panel):
    '''
    Function to find out best pairs w.r.t co-integration
    :param securities_panel: panel with closing prices per security
    :return: score matrix, p-value matrix of co-integration, and the selected pairs
    '''
    n = len(securities_panel.columns)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = securities_panel.keys
    pairs = []
    for i in range(n):
        for j in range(i+1, n):
            S1 = securities_panel.iloc[:,i]
            S2 = securities_panel.iloc[:,j]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.01:
                pairs.append((securities_panel.columns[i], securities_panel.columns[j],pvalue))
    return score_matrix, pvalue_matrix, pairs
Example 21
def find_cointegrated_pairs(data: pd.DataFrame) -> Tuple[np.ndarray, np.ndarray, List[Tuple]]:
    """
    This function will look for pairs of cointegrated stocks.

    :param data: pd.DataFrame
    :return: score matrix, p-value matrix, and the list of candidate pairs
    """
    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()

    # We store the stock pairs that are likely to be cointegrated
    pairs = []

    progress_bar = tqdm(total=n**2/2, desc='Running cointegration tests...')

    for i in range(n):

        for j in range(i+1, n):

            S1: pd.Series = data[keys[i]]                           # values from first column
            S2: pd.Series = data[keys[j]]                           # values from second column
            result: Tuple = coint(S1, S2)                           # level of cointegration
            score: float = result[0]                                # t-score
            pvalue: float = result[1]                               # p-value
            score_matrix[i, j] = score                              # add coint scores to score_matrix
            pvalue_matrix[i, j] = pvalue                            # add pvalues to pvalue_matrix

            # if the p-value is less than the threshold, append to list
            if pvalue < 0.02:
                pairs.append((keys[i], keys[j]))

            progress_bar.update(1)

    progress_bar.close()
    return score_matrix, pvalue_matrix, pairs
Example 22
def main():
    # set starting variables (only need to manually set tickers)
    endDate = date.today()
    startDate = endDate - datetime.timedelta(days=5 * 365)
    tickers = ['SMH', 'ARKK', 'XLK', 'QQQ', 'AAPL', 'MSFT',
               'TSLA', 'ORCL', 'QCOM', 'AMD', 'UBER', 'SQ']

    # get data for each ticker
    data = pdr.get_data_yahoo(tickers, start=startDate, end=endDate)
    prices = data["Adj Close"].dropna(axis='columns')

    # set up data for test
    keysList = prices.keys()
    pValMax = 0.2
    pairsList = []

    print(f"\n{str(len(keysList))} tickers span a valid backtest with {int((len(keysList) * (len(keysList) - 1)) / 2)} possible pair(s).")

    # run cointegration test on all possible pairs
    for i in range(len(keysList)):
        for j in range(i + 1, len(keysList)):
            result = coint(prices[keysList[i]], prices[keysList[j]])
            pvalue = result[1]

            if(pvalue < pValMax):
                corr = np.corrcoef(prices[keysList[i]], prices[keysList[j]])
                pairsList.append(
                    (keysList[i], keysList[j], pvalue, corr[0][1]))

    pairsList = sorted(pairsList, key=itemgetter(3), reverse=True)
    print(f"{len(pairsList)} possible cointegrated pairs with p-values less than {str(pValMax)}:")

    # print out valid pairs with sufficient p-value
    for pair in pairsList:
        print(f"\n {pair[0]} and {pair[1]}:")
        print(f"p-value = {round(pair[2], 4)}")
        print(f"correlation coefficient = {round(pair[3], 4)}")
Example 23
def coin_test():
    start_date = 1580515200000  # 1 February 2020, 00:00:00 UTC
    end_date = 1585612800000  # 31 March 2020, 00:00:00 UTC
    assets = [
        'EOSUSD', 'BTCUSD', 'ETHUSD', 'LTCUSD', 'TRXUSD', 'NEOUSD', 'ETCUSD',
        'XLMUSD'
    ]

    crypto_prices = pd.DataFrame()

    print("\n\nStaring cointest function............\n")
    for a in assets:
        print('Downloading ' + a)
        crypto_prices[a] = get_bitfinex_asset(asset=a,
                                              ts_ms_start=start_date,
                                              ts_ms_end=end_date)

    crypto_prices.head()

    # Normalize prices by first value
    norm_prices = crypto_prices.divide(crypto_prices.iloc[0])
    print(f'\n\n\nPrinting norm prices....\n\n{norm_prices}\n\n')
    plt.figure(figsize=(15, 10))
    plt.plot(norm_prices)
    plt.xlabel('days')
    plt.title('Performance of cryptocurrencies')
    plt.legend(assets)
    plt.show()

    df_dic = {'asset pairs': [], 'test result': []}
    for a1 in crypto_prices.columns:
        for a2 in crypto_prices.columns:
            if a1 != a2:
                test_result = ts.coint(crypto_prices[a1], crypto_prices[a2])
                # print(a1 + ' and ' + a2 + ': p-value = ' + str(test_result[1]))
                df_dic['asset pairs'].append(a1 + ' and ' + a2)
                df_dic['test result'].append(test_result[1])
Example 24
def find_cointegrated_pairs(securities):
  # Quantopian function to compare pairs of securities; revised to use
  # pd.Series instead of pd.Panel
  if type(securities) is not pd.Series:
    print('type is %s but should be pd.Series' %type(securities))
    return
  n = len(securities.index)
  score_matrix = np.zeros((n, n))
  pvalue_matrix = np.ones((n, n))
  keys = securities.index
  pairs = []
  for i in range(n):
    for j in range(i+1, n):
      S1 = securities[i]
      S2 = securities[j]
      S1, S2 = return_timelocked(S1, S2)
      result = coint(S1, S2)
      score = result[0]
      pvalue = result[1]
      score_matrix[i, j] = score
      pvalue_matrix[i, j] = pvalue
      if pvalue < 0.05:
        pairs.append((keys[i], keys[j]))
  return score_matrix, pvalue_matrix, pairs
Example 25
def find_cointegrated_pairs(securities_panel):
    '''
    Function to find out best pairs w.r.t co-integration
    :param securities_panel: panel with closing prices per security
    :return: score matrix, p-value matrix of co-integration, and the selected pairs
    '''
    n = len(securities_panel.columns)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = securities_panel.keys
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            S1 = securities_panel.iloc[:, i]
            S2 = securities_panel.iloc[:, j]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.01:
                pairs.append((securities_panel.columns[i],
                              securities_panel.columns[j], pvalue))
    return score_matrix, pvalue_matrix, pairs
Example 26
def find_cointegrated_pairs(data):
    import numpy as np
    import pandas as pd
    import statsmodels
    import statsmodels.api as sm
    from statsmodels.tsa.stattools import coint

    n = data.shape[1]
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = data.keys()
    pairs = []
    for i in range(n):
        for j in range(i + 1, n):
            S1 = data[keys[i]]
            S2 = data[keys[j]]
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue
            if pvalue < 0.05:
                pairs.append((keys[i], keys[j]))
    return score_matrix, pvalue_matrix, pairs
Example 27
def find_cointegrated_pairs(securities_panel):
    n = len(securities_panel.minor_axis)
    score_matrix = np.zeros((n, n))
    pvalue_matrix = np.ones((n, n))
    keys = securities_panel.keys
    pairs = []

    # Cycles through all combinations of two securities and checks cointegration on each
    for i in range(n):
        for j in range(i + 1, n):
            S1 = securities_panel.minor_xs(securities_panel.minor_axis[i])
            S2 = securities_panel.minor_xs(securities_panel.minor_axis[j])
            result = coint(S1, S2)
            score = result[0]
            pvalue = result[1]
            score_matrix[i, j] = score
            pvalue_matrix[i, j] = pvalue

            # Returns statistically significant pairs
            if pvalue < 0.05:
                pairs.append((securities_panel.minor_axis[i],
                              securities_panel.minor_axis[j]))

    return score_matrix, pvalue_matrix, pairs
Example 28
stock_prices.drop('date', axis = 1, inplace = True)

stock_prices = stock_prices.iloc[::-1]
stock_returns = stock_prices.pct_change()
stock_returns.drop('2013-07-31',axis=0,inplace = True) #calculate stocks' daily returns

#stock_returns = stock_returns.interpolate().dropna(axis=1)
stock_prices = stock_prices.interpolate().dropna(axis=1)

stock_list = list(stock_prices.columns.values)
pairs = [] #define pairs as list

#test cointegration of the stocks
for i in range(0,len(stock_list),1):
    for j in range(i+1, len(stock_list),1):
        results = coint(stock_prices.iloc[:,i],stock_prices.iloc[:,j])
        if results[1]< 0.01: #get the pvalues
            pairs.append([stock_list[i],stock_list[j],results[1]])

pairs_selected = []

#test difference of stock prices using ADF test
for i in range(0,len(pairs),1):
    results = sts.adfuller(stock_prices[pairs[i][0]]-stock_prices[pairs[i][1]],1)
    if results[1] < 0.05:
        pairs_selected.append(pairs[i])

#get the smallest p-value pair in 'pairs_selected'
pairs_selected.sort(key=itemgetter(2)) #ascending order thus the first two stocks are chosen

stock_1 = pairs_selected[0][0]  #China State Ship Building
Example 29
def do_coint_tests(
        dataframe: pd.DataFrame,
        pvalue_threshold: float = 0.04,
        coint_dir: Path=Path('tmp')) -> Tuple[pd.DataFrame, pd.DataFrame, List]:
    """
    Perform coint test between all series.
    Also writes the nested dictionaries to file.

    By default, the function will write the nested dictionaries to:
        - scores.json
        - pvalue.json

    :param dataframe: pandas DataFrame, where each series is a stock
    :param pvalue_threshold: float value representing threshold for pvalue to be considered cointegrated
    :return: Tuple of (df_scores, df_pvalue, pairs)
    """

    # big dictionary to keep track of values
    scores_dict: Dict[str, Dict[str, float]] = {}
    pvalue_dict: Dict[str, Dict[str, float]] = {}
    pairs: List[Tuple[str, str]] = []

    # iter through columns
    num_columns = len(dataframe.columns)
    pbar = tqdm(total=int(num_columns**2/2)-int(num_columns/2), desc='Performing Cointegration tests between columns')

    for i, (name1, s1) in enumerate(dataframe.iteritems()):

        for j, (name2, s2) in enumerate(dataframe.iloc[:, i+1:].iteritems()):

            # get coint results between two columns
            score, pvalue, _ = coint(s1, s2)

            # add scores and pvalues to dictionaries
            update_coint_dict(scores_dict, score, name1, name2)
            update_coint_dict(pvalue_dict, pvalue, name1, name2)

            # if the p-value is less than the tresh, append to list
            if pvalue < pvalue_threshold:
                pairs.append((name1, name2))

            pbar.update(1)

    coint_dir = coint_dir.resolve()
    coint_dir.mkdir(parents=True, exist_ok=True)

    with (coint_dir / 'scores.json').open('w') as f:
        json.dump(scores_dict, f, indent=4)

    with (coint_dir / 'pvalue.json').open('w') as f:
        json.dump(pvalue_dict, f, indent=4)

    with (coint_dir / 'pairs.txt').open('w') as f:
        for name1, name2 in pairs:
            f.write('{},{}\n'.format(name1, name2))

    df_scores = dataframe_from_coint_dict(scores_dict)
    df_pvalue = dataframe_from_coint_dict(pvalue_dict)

    return df_scores, df_pvalue, pairs
Example 30
if analysis_type < 2:
    # Calculate and plot price correlations.
    pearson_corr = df[tickers].corr()
    sns.clustermap(pearson_corr).fig.suptitle('Pearson Correlations')
    if analysis_type == 1:
        plt.show()

if analysis_type != 1:
    # Plot the marginal distributions.
    sns.set(style='darkgrid')
    sns.jointplot(df[pair_1], df[pair_2], kind='hex', color='#2874A6')

    # Calculate the p-value of cointegration test.
    x = df[pair_1]
    y = df[pair_2]
    _, p_value, _ = coint(x, y)
    print('The p_value of pair cointegration is: {}'.format(p_value))

    # Plot the linear relationship of the EURJPY-GBPJPY pair.
    df2 = df[[pair_1, pair_2]].copy()
    spread = df2[pair_1] - df2[pair_2]
    mean_spread = spread.mean()
    df2['Dev'] = spread - mean_spread
    rnd = np.random.choice(len(df), size=500)
    sns.scatterplot(x=pair_1,
                    y=pair_2,
                    hue='Dev',
                    linewidth=0.3,
                    alpha=0.8,
                    data=df2.iloc[rnd, :]).set_title(
                        '%s-%s Price Relationship' % (pair_1, pair_2))
Example 31
# Plot the BIC as a function of p
plt.plot(range(1, 7), BIC[1:7], marker='o')
plt.xlabel('Order of AR Model')
plt.ylabel('Bayesian Information Criterion')
plt.show()



###### COINTEGRATION
from statsmodels.tsa.stattools import coint, adfuller
from statsmodels.api import OLS, add_constant

P = df5['Adj Close_AMZN_Actual']
Q = df5['Adj Close_MSFT_Actual']
coint(P,Q)

ols = OLS(Q, add_constant(P)).fit()
hedge_ratio = ols.params[1]  # the OLS slope (hedge ratio), not a p-value
ad_test = adfuller(Q - hedge_ratio*P)
Example 32
 def get_p_value(x, y):
     _, p_val, _ = coint(x, y)
     return p_val
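A usage sketch, assuming the helper is available as a free function and prices is a DataFrame of aligned price series, one column per ticker:

from itertools import combinations

pvals = {(a, b): get_p_value(prices[a], prices[b])
         for a, b in combinations(prices.columns, 2)}
candidates = sorted((p, pair) for pair, p in pvals.items() if p < 0.05)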
Example 33
plt.axhline((Y/X).mean(), color='red', linestyle='--')

plt.xlabel('Time')
plt.legend(['Price Ratio', 'Mean'])
plt.show()

"""Here is a plot of the ratio between the two two series. Notice how it tends to revert back to the mean? This is a clear sign of cointegration.

## Cointegration Test

You now know what it means for two stocks to be cointegrated, but how do we actually quantify and test for cointegration?

The module statsmodels has a good cointegration test that outputs a t-score and a p-value. It's a lot of statistical mumbo-jumbo, but the upshot is that the p-value tells us how likely the observed statistic would be if the pair were not cointegrated. In the end, we want to see a low p-value, ideally less than 5%, to give us a clear indicator that the pair of stocks are very likely to be cointegrated.
"""

score, pvalue, _ = coint(X,Y)
print(pvalue)

# A low pvalue is strong evidence of cointegration!

"""### Clarification
In case you are a bit on the ropes regarding the difference between correlation and cointegration, let me show you some pictures that will make the distinction between correlation and cointegration clear.
"""

ret1 = np.random.normal(1, 1, 100)
ret2 = np.random.normal(2, 1, 100)

s1 = pd.Series(np.cumsum(ret1), name='X_diverging')
s2 = pd.Series(np.cumsum(ret2), name='Y_diverging')

pd.concat([s1, s2], axis=1).plot(figsize=(15, 7))
Example 34
 def testForCointegrationJohansen(self, series1,series2):
     # NOTE: despite its name, this runs the two-variable Engle-Granger
     # coint test with a constant-plus-trend ('ct') term, not a Johansen test
     a = coint(series1,series2, "ct")
     
     return a
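statsmodels does ship an actual Johansen test; a minimal sketch of calling it on the same two series:

import numpy as np
from statsmodels.tsa.vector_ar.vecm import coint_johansen

def johansen_test(series1, series2, det_order=0, k_ar_diff=1):
    data = np.column_stack([series1, series2])
    result = coint_johansen(data, det_order, k_ar_diff)
    # lr1 holds the trace statistics; cvt the 90%/95%/99% critical values
    return result.lr1, result.cvt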
Example 35
MR_one = mean_revert_FX_one.mean_revert_algo(eurusd,gbpusd,'EUR','UK','US')
eurusd_gbpusd = MR_one.merging_func()

fig, ax1 = plt.subplots()
x=eurusd_gbpusd['Date']
y1=eurusd_gbpusd['Price_x']
y2=eurusd_gbpusd['Price_y']
ax2 = ax1.twinx()
ax1.plot(x, y1, 'g-')
ax1.plot(x, y2, 'b-')

eurusd_gbpusd.Price_x = pd.to_numeric(eurusd_gbpusd.Price_x)
eurusd_gbpusd.Price_y = pd.to_numeric(eurusd_gbpusd.Price_y)

ts.adfuller(eurusd_gbpusd['Price_y'])
result = ts.coint(eurusd_gbpusd['Price_x'],eurusd_gbpusd['Price_y'])

eurusd_gbpusd_test = eurusd_gbpusd[['Price_x','Price_y']]

coint_johansen(eurusd_gbpusd_test, -1, 1)

### Analyzing spreads and ratios ###

eurusd_gbpusd['Price_diff'] = eurusd_gbpusd.Price_x - eurusd_gbpusd.Price_y
eurusd_gbpusd['Log_ret'] = np.log(eurusd_gbpusd.Price_x/eurusd_gbpusd.Price_y)
eurusd_gbpusd['ret'] = eurusd_gbpusd.Price_x/eurusd_gbpusd.Price_y

eurusd_gbpusd.plot('Date','ret')
plt.show()

########################################
Example 36
def coint_similar(symbol, sc=slice(0, 2), show=True):
    """
        TODO: once stable, assign the slice to a named variable and set sc to it directly
    """
    pd_list = get_pdlist(sc)
    sum_rank = get_sum_rank(pd_list, symbol)

    rank_head = sum_rank.sort_values(ascending=True)[1:100]

    kl_pd = SymbolPd.make_kfold_pd(symbol, n_folds=1)
    mul_pd = SymbolPd.make_kfold_mulpd(rank_head.index.tolist(), n_folds=1)
    coint_dict = {}
    for ind, cmp_symbol in enumerate(rank_head.index):
        klpd_cmp = mul_pd[cmp_symbol]
        if klpd_cmp is None:
            continue
        _, pvalue, _ = coint(kl_pd.close, klpd_cmp.close)
        if pvalue < 0.08:
            """
                the index is recorded to see how many of
                sort_values(ascending=True)[1:100] yield good data
            """
            coint_dict[cmp_symbol] = (pvalue, ind)
    p_value_sorted = sorted(zip(coint_dict.values(), coint_dict.keys()))

    cmp_cnt = np.minimum(len(p_value_sorted), 10)
    symbols = [item[1] for item in p_value_sorted[:cmp_cnt]]

    mul_pd_it = mul_pd.swapaxes('items', 'minor')
    sd = mul_pd_it.items.tolist()
    sd.remove('close')
    """
        to extract a clean close column from the 3-D panel
    """
    close_panel = mul_pd_it.drop(sd)
    close_panel_pd = close_panel.loc['close'][symbols]

    if show:
        close_panel_pd_regular = NpUtil.regular_std(close_panel_pd)
        close_panel_pd_regular.plot()
        plt.title('close panel pd regular')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()

    close_panel_pd_cp = copy.deepcopy(close_panel_pd)

    for col in close_panel_pd_cp.columns:
        """
            build an identical DataFrame so a voting-weight table is easy to compute
        """
        close_panel_pd_cp[col] = kl_pd.close
    regular_diff = NpUtil.regular_std(close_panel_pd_cp - close_panel_pd)

    if show:
        regular_diff.plot()
        plt.title('regular diff')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()

    """
        voting-like mechanism: obtain the summed coint differences as votes,
            distance_votes
        2015-07-27   -14.724491
        2015-07-28   -12.712066
        2015-07-29   -11.945266
        2015-07-30   -13.801350
        2015-07-31   -13.520431
        2015-08-03   -11.381343
        2015-08-04    -9.486645
        2015-08-05   -11.319338
        2015-08-06    -6.517725
        2015-08-07    -9.103014
        2015-08-10    -5.025694
        ......................
    """
    distance_votes = regular_diff.sum(axis=1)
    votes_std = distance_votes.std()
    votes_mean = distance_votes.mean()
    above = votes_mean + votes_std
    below = votes_mean - votes_std
    if show:
        close_regular = NpUtil.regular_std(kl_pd.close)
        close_regular = (close_regular * distance_votes.max() / 2)
        close_regular.plot()
        distance_votes.plot()

        plt.axhline(votes_mean, color='r')
        plt.axhline(above, color='c')
        plt.axhline(below, color='g')

        plt.title('coint distance votes')
        plt.legend(['close regular', 'distance votes', 'votes mean', 'dvotes above', 'dvotes below'],
                   bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
Example 37
 def __init__(self):
     self.coint_t = coint(self.y1, self.y2, regression ="c")[0]
     self.teststat = -1.8208817
Example 38
import pandas as pd
import statsmodels.tsa.stattools as ts

df = pd.read_excel('data.xlsx', index_col='Dates')
df['GTII10'] = df['GTII10']*-1

date_1, date_2 = '2016-01-01', '2017-01-01'
df_filtered = df.loc[date_1:date_2]

print(ts.coint(df_filtered['GTII10'], df_filtered['GOLDS'])[1])
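A small extension sketch, not in the original: repeating the same test over successive one-year windows to watch the relationship drift (the year range is illustrative, and it assumes the Dates index is parsed as datetimes):

for year in range(2014, 2020):
    window = df.loc[f'{year}-01-01':f'{year}-12-31']
    if len(window) < 50:  # skip windows with too little data
        continue
    print(year, round(ts.coint(window['GTII10'], window['GOLDS'])[1], 4))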
Example 39
data = pd.read_csv(...,  # source file elided in the original snippet
                   parse_dates='Date',
                   index_col='Date')

X = data['ADANIPOWER']
Y = data['JSWENERGY']

Y.tail(5)
X = X['2014-01-01':'2014-12-31']
Y = Y['2014-01-01':'2014-12-31']

pd.concat([X, Y], axis=1).plot()

(Y / X).plot()
plt.axhline((Y / X).mean(), color='red', linestyle='--')

score, pvalue, _ = coint(X, Y)
print(pvalue)


def zscore(series):
    return (series - series.mean()) / np.std(series)


ratio_series = Y / X
zscore(ratio_series).plot()
plt.axhline(zscore(ratio_series).mean(), color='black')
plt.axhline(1.0, color='red', linestyle='--')
plt.axhline(-1.0, color='green', linestyle='--')

ratio = Y / X
ratio.name = 'ratio'
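The ±1 bands plotted above are the usual entry triggers for this kind of strategy; a minimal signal sketch built on them (the 0.5 exit band is an assumption, not from the original):

z = zscore(ratio)
# short the spread when the ratio is rich, long it when cheap
short_entry = z > 1.0
long_entry = z < -1.0
flat_exit = abs(z) < 0.5   # assumed exit band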
Example 40
 def setup_class(cls):
     #cls.coint_t = coint(cls.y1, cls.y2, trend="c")[0]
     cls.coint_t = coint(cls.y1, cls.y2, trend="c", maxlag=0,
                         autolag=None)[0]
     # reference value from the commented-out call above:
     # cls.teststat = -1.8208817
     cls.teststat = -1.830170986148
Example 41
    def checkPortfolioForCoint(self,
                               critValue=0.01,
                               fromDate="2015-01-01",
                               toDate="2020-09-21",
                               calcMean=False,
                               usead=0):
        """
        This is one of the most useful function in this class
        :param critValue: The critical value of which to judge things are cointegrated, equivalent to the p value cut off
        :param fromDate: The date to start the check of cointegration test from
        :param toDate: The date to end the check of cointegration test
        :param calcMean: Sometimes it is handy to return the mean of the cointegrated data, this gives a boolean for doing that
        :param usead: I tested other cointegration methods and this allows you to adjust which one to use
        :return: The dataframe of (symbol1, symbol2, pvalue), so the 2 cointegrated stocks and the certainty of which they are cointegrated
        """
        start_time = time.time()
        num_stocks = len(self.portfolio)
        keys = list(self.portfolio.keys())
        df = pd.DataFrame(columns=('symbol1', 'symbol2', 'pvalue'))
        for i in range(num_stocks):
            for j in range(i + 1, num_stocks):
                data1 = self.portfolio[keys[i]][
                    self.analysisOn][fromDate:toDate]
                data2 = self.portfolio[keys[j]][
                    self.analysisOn][fromDate:toDate]
                model = sm.OLS(data1, sm.add_constant(data2))
                results = model.fit()
                spread = data1 - results.params[1] * data2 - results.params[0]
                if results.params[1] < 0:
                    continue
                try:
                    if not usead:
                        result = coint(data1, data2)
                    else:
                        result = adfuller(spread)
                    # else:
                    #     df = pd.DataFrame({'data1': data1, 'data2': data2})
                    #     result = coint_johansen(df, 0, 1)
                    #     print("Critical values(90%, 95%, 99%) of max_eig_stat\n", result.cvm, '\n')
                    #     print("Critical values(90%, 95%, 99%) of trace_stat\n", result.cvt, '\n')

                except Exception:
                    print("Cannot calculate coint for {}, {}".format(
                        keys[i], keys[j]))
                    continue
                pvalue = result[1]
                if pvalue < critValue:
                    mean = np.mean(spread)
                    std = np.std(spread)
                    df = df.append(
                        {
                            'symbol1': keys[i],
                            'symbol2': keys[j],
                            'pvalue': pvalue,
                            'mean': mean,
                            'std': std
                        },
                        ignore_index=True)
                    # At some point add sm.OLS HERE

        df = df.sort_values(by='pvalue', ignore_index=True)
        self.cointStocks = df
        print("checkPortfolioForCoint took {} seconds".format(time.time() -
                                                              start_time))
        if not calcMean:
            return df
Example 42
def coint_similar(symbol, sum_rank=None, corr_jobs=(ECoreCorrType.E_CORE_TYPE_PEARS,
                                                    ECoreCorrType.E_CORE_TYPE_SPERM), show=True):
    """
    首先找到的是最相关的top个,从top n个最相关的再找协整,只考虑pvalue,因为已经是从top n个最相关的再找协整
    可视化整个过程

    :param symbol: eg: 'usTSLA'
    :param sum_rank: 已经缓存了的sum_rank数据,
                        eg: sum_rank
                        usBIDU           10.0
                        usFB             16.0
                        usGOOG           12.0
                        usNOAH            2.0
                        usSFUN           14.0
                        usTSLA           18.0
                        usVIPS            6.0
                        usWUBA            8.0
    :param corr_jobs: 默认:corr_jobs=(ECoreCorrType.E_CORE_TYPE_PEARS, ECoreCorrType.E_CORE_TYPE_SPERM)
                      可以再添加更多jobs
                      eg:
                        corr_jobs=(ECoreCorrType.E_CORE_TYPE_PEARS, ECoreCorrType.E_CORE_TYPE_SPERM,
                                   ECoreCorrType.E_CORE_TYPE_SIGN, ECoreCorrType.E_CORE_TYPE_ROLLING)
                      注意每添加一种相关计算方法,耗时都会增加
    :param show: 是否进行可视化
    """
    cs = code_to_symbol(symbol)
    symbol = cs.value
    if sum_rank is None:
        tmp_market = ABuEnv.g_market_target
        # temporarily force the market to the symbol's market
        ABuEnv.g_market_target = cs.market
        corr_df_dict = ABuSimilar.multi_corr_df(corr_jobs)
        # restore the previous market
        ABuEnv.g_market_target = tmp_market
        """
            eg: corr_df_dict
            {'pears':
                            usBIDU    usFB  usGOOG  usNOAH  usSFUN  usTSLA  usVIPS  usWUBA
            usBIDU         1.0000  0.3013  0.3690  0.4015  0.3680  0.3015  0.3706  0.4320
            usFB           0.3013  1.0000  0.6609  0.2746  0.1978  0.4080  0.2856  0.2438
            usGOOG         0.3690  0.6609  1.0000  0.3682  0.1821  0.3477  0.3040  0.2917
            usNOAH         0.4015  0.2746  0.3682  1.0000  0.3628  0.2178  0.4645  0.4488
            usSFUN         0.3680  0.1978  0.1821  0.3628  1.0000  0.2513  0.2843  0.4883
            usTSLA         0.3015  0.4080  0.3477  0.2178  0.2513  1.0000  0.2327  0.3340
            usVIPS         0.3706  0.2856  0.3040  0.4645  0.2843  0.2327  1.0000  0.4189
            usWUBA         0.4320  0.2438  0.2917  0.4488  0.4883  0.3340  0.4189  1.0000

            'sperm':
                            usBIDU    usFB  usGOOG  usNOAH  usSFUN  usTSLA  usVIPS  usWUBA
            usBIDU         1.0000  0.3888  0.4549  0.4184  0.3747  0.3623  0.4333  0.4396
            usFB           0.3888  1.0000  0.7013  0.2927  0.2379  0.4200  0.3123  0.2216
            usGOOG         0.4549  0.7013  1.0000  0.3797  0.2413  0.3871  0.3922  0.3035
            usNOAH         0.4184  0.2927  0.3797  1.0000  0.3581  0.2066  0.4643  0.4382
            usSFUN         0.3747  0.2379  0.2413  0.3581  1.0000  0.2645  0.3890  0.4693
            usTSLA         0.3623  0.4200  0.3871  0.2066  0.2645  1.0000  0.2540  0.2801
            usVIPS         0.4333  0.3123  0.3922  0.4643  0.3890  0.2540  1.0000  0.4080
            usWUBA         0.4396  0.2216  0.3035  0.4382  0.4693  0.2801  0.4080  1.0000 }
        """
        sum_rank = rank_corr_sum(corr_df_dict, symbol)
        """
            eg: sum_rank
            usBIDU           10.0
            usFB             16.0
            usGOOG           12.0
            usNOAH            2.0
            usSFUN           14.0
            usTSLA           18.0
            usVIPS            6.0
            usWUBA            8.0
        """
        if sum_rank is None:
            logging.info('{} not in corr df!!!'.format(symbol))
            return None, None

    top_cnt = sum_rank.shape[0] if g_top_corr_cnt > sum_rank.shape[0] else g_top_corr_cnt
    # first take the most correlated top symbols
    rank_head = sum_rank.sort_values(ascending=True)[1:top_cnt]

    # use symbol as the benchmark yardstick
    benchmark = AbuBenchmark(symbol, n_folds=1)
    # with the benchmark as the data yardstick, fetch the top correlated time series
    mul_pd = ABuSymbolPd.make_kl_df(rank_head.index, n_folds=1,
                                    data_mode=EMarketDataSplitMode.E_DATA_SPLIT_UNDO,
                                    benchmark=benchmark)

    coint_dict = {}
    for ind, cmp_symbol in enumerate(rank_head.index):
        if cmp_symbol not in mul_pd:
            continue

        klpd_cmp = mul_pd[cmp_symbol]
        if klpd_cmp is None:
            continue

        """
                coint returns three values, as follows:
                coint_t : float
                    t-statistic of unit-root test on residuals
                pvalue : float
                    MacKinnon's approximate p-value based on MacKinnon (1994)
                crit_value : dict
                    Critical values for the test statistic at the 1 %, 5 %, and 10 %
                    levels.

                only the pvalue is considered here, since the candidates are already the top-n most correlated
        """
        _, pvalue, _ = coint(benchmark.kl_pd.close, klpd_cmp.close)
        if pvalue < g_coint_threshold:
            # add to coint_dict when the pvalue is below the threshold

            # ind is recorded to see how many of sort_values(ascending=True)[1:g_top_corr_cnt] yield good data,
            # i.e. for tuning g_top_corr_cnt after the fact; not strictly needed
            coint_dict[cmp_symbol] = (pvalue, ind)
    p_value_sorted = sorted(zip(coint_dict.values(), coint_dict.keys()))
    if len(p_value_sorted) == 0:
        logging.info(
            'len(p_value_sorted) == 0 please try change tl.similar.g_top_corr_cnt|tl.similar.g_coint_threshold!')
        return None, None

    if show:
        cmp_cnt = np.minimum(len(p_value_sorted), g_coint_show_max)
        # take item[1] only; item[0] is the (pvalue, ind) tuple
        symbols = [item[1] for item in p_value_sorted[:cmp_cnt]]
        mul_pd_swap = mul_pd.swapaxes('items', 'minor')
        close_panel_pd = mul_pd_swap['close'][symbols]
        """
            after swapping axes, keep only the close-price series
            eg: close_panel_pd
                          usFB  usGOOG  usNOAH  usVIPS  usWUBA  us_NYSE:.IXIC
            2015-07-24   96.95  623.56   23.40  20.250   65.25       5088.629
            2015-07-27   94.17  627.26   22.16  19.990   62.89       5039.776
            2015-07-28   95.29  628.00   22.94  20.200   60.32       5089.207
            2015-07-29   96.99  631.93   23.35  20.260   59.89       5111.730
            2015-07-30   95.21  632.59   22.87  19.700   60.24       5128.785
            ...            ...     ...     ...     ...     ...            ...
            2016-07-20  121.92  741.19   25.11  13.630   48.17       5089.930
            2016-07-21  120.61  738.63   25.51  13.690   49.25       5073.900
            2016-07-22  121.00  742.74   25.50  13.510   49.21       5100.160
            2016-07-25  121.63  739.77   25.57  13.390   49.84       5097.628
            2016-07-26  121.64  740.92   24.75  13.655   50.36       5084.629
        """
        # scale the data onto a common level; note mean_how=True, which avoids distortion from extreme values
        close_panel_pd = ABuScalerUtil.scaler_matrix(close_panel_pd, mean_how=True)
        """
            the data matrix after ABuScalerUtil.scaler_matrix scaling looks like:
            eg: close_panel_pd
                             usFB     usGOOG     usNOAH     usVIPS     usWUBA
            2015-07-24  4451.7674  4311.1198  4477.3494  6601.2284  5980.4246
            2015-07-27  4324.1148  4336.7006  4240.0882  6516.4719  5764.1211
            2015-07-28  4375.5432  4341.8168  4389.3332  6584.9290  5528.5703
            2015-07-29  4453.6041  4368.9877  4467.7825  6604.4882  5489.1591
            ...               ...        ...        ...        ...        ...
            2016-07-20  5598.3443  5124.3808  4804.5404  4443.1972  4414.9740
            2016-07-21  5538.1915  5106.6817  4881.0762  4462.7564  4513.9603
            2016-07-22  5556.0995  5135.0971  4879.1628  4404.0788  4510.2942
            2016-07-25  5585.0280  5114.5633  4892.5566  4364.9604  4568.0362
            2016-07-26  5585.4872  5122.5141  4735.6581  4451.3468  4615.6963
        """
        # visualize the close series after the scaler_matrix operation
        close_panel_pd.plot(figsize=ABuEnv.g_plt_figsize)
        plt.title('close panel pd scaler_matrix')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()

        close_panel_pd_cp = copy.deepcopy(close_panel_pd)

        for col in close_panel_pd_cp.columns:
            """
                build an identical DataFrame so the voting-weight table is easy to compute:

                     close_panel_pd_cp[col] = benchmark.kl_pd.close

                every column is replaced with the benchmark's data, so each column
                ends up identical, e.g. every column matches the benchmark:
                            usFB  usGOOG  usNOAH  usVIPS  usWUBA  us_NYSE:.IXIC
                2015-07-24  265.41  265.41  265.41  265.41  265.41         265.41
                2015-07-27  253.01  253.01  253.01  253.01  253.01         253.01
                2015-07-28  264.82  264.82  264.82  264.82  264.82         264.82
                2015-07-29  263.82  263.82  263.82  263.82  263.82         263.82
                2015-07-30  266.79  266.79  266.79  266.79  266.79         266.79
                ...            ...     ...     ...     ...     ...            ...
                2016-07-20  228.36  228.36  228.36  228.36  228.36         228.36
                2016-07-21  220.50  220.50  220.50  220.50  220.50         220.50
                2016-07-22  222.27  222.27  222.27  222.27  222.27         222.27
                2016-07-25  230.01  230.01  230.01  230.01  230.01         230.01
                2016-07-26  225.93  225.93  225.93  225.93  225.93         225.93
            """
            close_panel_pd_cp[col] = benchmark.kl_pd.close
        """
            take the difference between the cloned close_panel_pd_cp and the original close_panel_pd, then apply scaler_std
            ABuScalerUtil.scaler_std(close_panel_pd_cp - close_panel_pd):

                  usFB  usGOOG  usNOAH  usVIPS  usWUBA  us_NYSE:.IXIC
            2015-07-24  0.9705  1.7793  0.7405 -1.6987 -1.9294        -1.0803
            2015-07-27  1.2277  1.6619  1.1473 -1.6270 -1.5697        -0.8853
            2015-07-28  1.1393  1.6826  0.8987 -1.6831 -1.1334        -1.0866
            2015-07-29  0.9629  1.5955  0.7550 -1.7035 -1.0656        -1.2124
            2015-07-30  1.1519  1.5906  0.9265 -1.5197 -1.1169        -1.2878
            ...            ...     ...     ...     ...     ...            ...
            2016-07-21 -1.5539 -0.8188 -0.0710  0.3755  0.5784        -1.2418
            2016-07-22 -1.5899 -0.9012 -0.0644  0.4354  0.5879        -1.3728
            2016-07-25 -1.6371 -0.8138 -0.0746  0.4819  0.4997        -1.3179
            2016-07-26 -1.6473 -0.8509  0.2018  0.3922  0.4085        -1.2702
        """
        regular_diff = ABuScalerUtil.scaler_std(close_panel_pd_cp - close_panel_pd)
        regular_diff.plot(figsize=ABuEnv.g_plt_figsize)
        plt.title('regular diff')
        plt.legend(bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
        plt.show()
        """
            distance_votes = regular_diff.sum(axis=1):

            voting mechanism: obtain the summed coint differences as votes,
                distance_votes
            distance_votes
            2015-07-24   -1.2181
            2015-07-27   -0.0451
            2015-07-28   -0.1825
            2015-07-29   -0.6682
            2015-07-30   -0.2555
                           ...
            2016-07-20   -2.5541
            2016-07-21   -2.7316
            2016-07-22   -2.9049
            2016-07-25   -2.8618
            2016-07-26   -2.7658
            ......................
        """
        distance_votes = regular_diff.sum(axis=1)

        votes_std = distance_votes.std()
        votes_mean = distance_votes.mean()
        above = votes_mean + votes_std
        below = votes_mean - votes_std
        close_regular = ABuScalerUtil.scaler_std(benchmark.kl_pd.close)
        close_regular = (close_regular * distance_votes.max() / 2)

        with plt_show():
            # noinspection PyUnresolvedReferences
            close_regular.plot()
            distance_votes.plot()

            plt.axhline(votes_mean, color='r')
            plt.axhline(above, color='c')
            plt.axhline(below, color='g')

            plt.title('coint distance votes')
            plt.legend(['close regular', 'distance votes', 'votes mean', 'dvotes above', 'dvotes below'],
                       bbox_to_anchor=(1.05, 1), loc=2, borderaxespad=0.)
    return p_value_sorted, sum_rank
Example 43
quotes = []

for idx, name in enumerate(names):
    quote = stock_data_by_name(conn, name)
    quote = quote[:115]
    series = pd.Series(quote['close'], dtype='float')
    s_dict = {'name': name, 'data': series}
    quotes.append(s_dict)

stock_cb = list(combinations(quotes, 2))
print("stock combinations length : {}".format(len(stock_cb)))
result = []
for stock_info in stock_cb:
    x = stock_info[0].get("data")
    x_name = stock_info[0].get("name")
    y = stock_info[1].get("data")
    y_name = stock_info[1].get("name")
    coin_result = ts.coint(x, y)
    adf = coin_result[0]
    p_value = coin_result[1]
    one_percent = coin_result[2][0]
    five_percent = coin_result[2][1]
    ten_percent = coin_result[2][2]
    if p_value < 0.01 and adf < one_percent and adf < five_percent and adf < ten_percent:
        print("result: {}, stock tuple:{} ".format(coin_result,
                                                   (x_name, y_name)))
        result.append((x_name, y_name))
print(len(result))

if __name__ == '__main__':
    print("execute over")
Example 44
def test_coint():
    nobs = 200
    scale_e = 1
    const = [1, 0, 0.5, 0]
    np.random.seed(123)
    unit = np.random.randn(nobs).cumsum()
    y = scale_e * np.random.randn(nobs, 4)
    y[:, :2] += unit[:, None]
    y += const
    y = np.round(y, 4)

    for trend in []:#['c', 'ct', 'ctt', 'nc']:
        print('\n', trend)
        print(coint(y[:, 0], y[:, 1], trend=trend, maxlag=4, autolag=None))
        print(coint(y[:, 0], y[:, 1:3], trend=trend, maxlag=4, autolag=None))
        print(coint(y[:, 0], y[:, 2:], trend=trend, maxlag=4, autolag=None))
        print(coint(y[:, 0], y[:, 1:], trend=trend, maxlag=4, autolag=None))

    # results from Stata egranger
    res_egranger = {}
    # trend = 'ct'
    res = res_egranger['ct'] = {}
    res[0]  = [-5.615251442239, -4.406102369132,  -3.82866685109, -3.532082997903]
    res[1]  = [-5.63591313706, -4.758609717199, -4.179130554708, -3.880909696863]
    res[2]  = [-2.892029275027, -4.758609717199, -4.179130554708, -3.880909696863]
    res[3]  = [-5.626932544079,  -5.08363327039, -4.502469783057,   -4.2031051091]

    # trend = 'c'
    res = res_egranger['c'] = {}
    # first critical value res[0][1] has a discrepancy starting at 4th decimal
    res[0]  = [-5.760696844656, -3.952043522638, -3.367006313729, -3.065831247948]
    # manually adjusted to have higher precision as in other cases
    res[0][1] = -3.952321293401682
    res[1]  = [-5.781087068772, -4.367111915942, -3.783961136005, -3.483501524709]
    res[2]  = [-2.477444137366, -4.367111915942, -3.783961136005, -3.483501524709]
    res[3]  = [-5.778205811661, -4.735249216434, -4.152738973763, -3.852480848968]

    # trend = 'ctt'
    res = res_egranger['ctt'] = {}
    res[0]  = [-5.644431269946, -4.796038299708, -4.221469431008, -3.926472577178]
    res[1]  = [-5.665691609506, -5.111158174219,  -4.53317278104,  -4.23601008516]
    res[2]  = [-3.161462374828, -5.111158174219,  -4.53317278104,  -4.23601008516]
    res[3]  = [-5.657904558563, -5.406880189412, -4.826111619543, -4.527090164875]

    # The following for 'nc' are only regression test numbers
    # trend = 'nc' not allowed in egranger
    # trend = 'nc'
    res = res_egranger['nc'] = {}
    nan = np.nan  # shortcut for table
    res[0]  = [-3.7146175989071137, nan, nan, nan]
    res[1]  = [-3.8199323012888384, nan, nan, nan]
    res[2]  = [-1.6865000791270679, nan, nan, nan]
    res[3]  = [-3.7991270451873675, nan, nan, nan]

    for trend in ['c', 'ct', 'ctt', 'nc']:
        res1 = {}
        res1[0] = coint(y[:, 0], y[:, 1], trend=trend, maxlag=4, autolag=None)
        res1[1] = coint(y[:, 0], y[:, 1:3], trend=trend, maxlag=4,
                        autolag=None)
        res1[2] = coint(y[:, 0], y[:, 2:], trend=trend, maxlag=4, autolag=None)
        res1[3] = coint(y[:, 0], y[:, 1:], trend=trend, maxlag=4, autolag=None)

        for i in range(4):
            res = res_egranger[trend]

            assert_allclose(res1[i][0], res[i][0], rtol=1e-11)
            r2 = res[i][1:]
            r1 = res1[i][2]
            assert_allclose(r1, r2, rtol=0, atol=6e-7)
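# Each coint(...) call above returns a 3-tuple: the Engle-Granger test
# statistic, its p-value, and the [1%, 5%, 10%] critical values. The
# res_egranger rows store [t_stat, crit_1pct, crit_5pct, crit_10pct], which
# is why the assertions compare res1[i][0] and res1[i][2] against res[i][0]
# and res[i][1:]. A minimal sketch of the unpacking:
import numpy as np
from statsmodels.tsa.stattools import coint

np.random.seed(0)
walk = np.random.randn(200).cumsum()
a = walk + np.random.randn(200)  # two noisy observations of the same walk
b = walk + np.random.randn(200)
t_stat, p_value, crit_values = coint(a, b, trend='c', maxlag=4, autolag=None)
print(t_stat, p_value, crit_values)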
Example n. 45
def rebalance_pairs(context, data):
    if get_open_orders():
        return

    prices = data.history(context.futures_list, 'price', context.long_ma, '1d')

    for future_y, future_x in context.futures_pairs:
        Y = prices[future_y]
        X = prices[future_x]

        y_log = np.log(Y)
        x_log = np.log(X)

        pvalue = coint(y_log, x_log)[1]
        if pvalue > 0.10:
            log.info(
                '({} {}) no longer cointegrated, no new positions.'.format(
                    future_y.root_symbol,
                    future_x.root_symbol,
                )
            )
            continue

        regression = sp.stats.linregress(
            x_log[-context.long_ma:],
            y_log[-context.long_ma:],
        )

        # NOTE: the regression was fit on log prices, but the spread here is
        # built from raw prices; a consistent alternative would be
        # y_log - regression.slope * x_log.
        spreads = Y - (regression.slope * X)

        zscore = (np.mean(spreads[-context.short_ma:]) -
                  np.mean(spreads)) / np.std(spreads, ddof=1)

        future_y_contract, future_x_contract = data.current(
            [future_y, future_x],
            'contract',
        )

        context.current_weights[future_y_contract] = context.long_term_weights[
            future_y_contract.root_symbol]
        context.current_weights[future_x_contract] = context.long_term_weights[
            future_x_contract.root_symbol]

        hedge_ratio = regression.slope

        if context.inShort[(future_y.root_symbol,
                            future_x.root_symbol)] and zscore < 0.0:
            context.long_term_weights[future_y_contract.root_symbol] = 0
            context.long_term_weights[future_x_contract.root_symbol] = 0
            context.current_weights[
                future_y_contract] = context.long_term_weights[
                    future_y_contract.root_symbol]
            context.current_weights[
                future_x_contract] = context.long_term_weights[
                    future_x_contract.root_symbol]

            context.inLong[(future_y.root_symbol,
                            future_x.root_symbol)] = False
            context.inShort[(future_y.root_symbol,
                             future_x.root_symbol)] = False
            continue

        if context.inLong[(future_y.root_symbol,
                           future_x.root_symbol)] and zscore > 0.0:
            context.long_term_weights[future_y_contract.root_symbol] = 0
            context.long_term_weights[future_x_contract.root_symbol] = 0
            context.current_weights[
                future_y_contract] = context.long_term_weights[
                    future_y_contract.root_symbol]
            context.current_weights[
                future_x_contract] = context.long_term_weights[
                    future_x_contract.root_symbol]

            context.inLong[(future_y.root_symbol,
                            future_x.root_symbol)] = False
            context.inShort[(future_y.root_symbol,
                             future_x.root_symbol)] = False
            continue

        if zscore < -1.0 and (not context.inLong[(future_y.root_symbol,
                                                  future_x.root_symbol)]):
            # Only trade if NOT already in a trade
            y_target_contracts = 1
            x_target_contracts = hedge_ratio
            context.inLong[(future_y.root_symbol, future_x.root_symbol)] = True
            context.inShort[(future_y.root_symbol,
                             future_x.root_symbol)] = False

            (y_target_pct, x_target_pct) = computeHoldingsPct(
                y_target_contracts, x_target_contracts,
                future_y_contract.multiplier * Y[-1],
                future_x_contract.multiplier * X[-1])

            context.long_term_weights[
                future_y_contract.root_symbol] = y_target_pct
            context.long_term_weights[
                future_x_contract.root_symbol] = -x_target_pct
            context.current_weights[
                future_y_contract] = context.long_term_weights[
                    future_y_contract.root_symbol]
            context.current_weights[
                future_x_contract] = context.long_term_weights[
                    future_x_contract.root_symbol]
            continue

        if zscore > 1.0 and (not context.inShort[(future_y.root_symbol,
                                                  future_x.root_symbol)]):
            # Only trade if NOT already in a trade
            y_target_contracts = 1
            x_target_contracts = hedge_ratio

            context.inLong[(future_y.root_symbol,
                            future_x.root_symbol)] = False
            context.inShort[(future_y.root_symbol,
                             future_x.root_symbol)] = True

            (y_target_pct, x_target_pct) = computeHoldingsPct(
                y_target_contracts, x_target_contracts,
                future_y_contract.multiplier * Y[-1],
                future_x_contract.multiplier * X[-1])

            context.long_term_weights[
                future_y_contract.root_symbol] = -y_target_pct
            context.long_term_weights[
                future_x_contract.root_symbol] = x_target_pct
            context.current_weights[
                future_y_contract] = context.long_term_weights[
                    future_y_contract.root_symbol]
            context.current_weights[
                future_x_contract] = context.long_term_weights[
                    future_x_contract.root_symbol]
            continue

    adjusted_weights = pd.Series({
        k: v / (len(context.futures_pairs))
        for k, v in context.current_weights.items()
    })

    order_optimal_portfolio(
        opt.TargetWeights(adjusted_weights),
        constraints=[
            opt.MaxGrossExposure(1.0),
        ],
    )
    log.info('weights: {}'.format(adjusted_weights))
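# computeHoldingsPct is called above but not defined in this snippet. A
# plausible minimal sketch (an assumption, not the author's implementation):
# scale the two legs' notionals so the pair's gross exposure sums to 100%.
def computeHoldingsPct(y_shares, x_shares, y_dollar, x_dollar):
    y_target = y_shares * y_dollar
    x_target = x_shares * x_dollar
    notional = abs(y_target) + abs(x_target)
    return (y_target / notional, x_target / notional)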
Example n. 46
 def setup_class(cls):
     #cls.coint_t = coint(cls.y1, cls.y2, trend="c")[0]
     cls.coint_t = coint(cls.y1, cls.y2, trend="c", maxlag=0, autolag=None)[0]
     cls.teststat = -1.8208817
     # the second assignment overrides the first reference value
     cls.teststat = -1.830170986148
Example n. 47
spread = prices[etfs[1]] - prices[etfs[0]]
puntuacion = (spread - spread.mean()) / np.std(spread)  # z-score of the spread ('puntuacion' = score)
puntuacion.plot()
plt.axhline(puntuacion.mean())
plt.axhline(1.0, color='red')
plt.axhline(-1.0, color='green')
plt.title('Zscore of the spread')
plt.show()
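# The +/-1 bands plotted above are the usual entry thresholds for a pairs
# trade. A minimal, hypothetical sketch of turning them into signals,
# assuming pandas as pd is imported and using puntuacion as computed above
# (the threshold value is illustrative, not from the source):
def band_signals(zscore, entry=1.0):
    signals = pd.Series(0, index=zscore.index)
    signals[zscore > entry] = -1   # short the spread above +entry
    signals[zscore < -entry] = 1   # long the spread below -entry
    return signals

# e.g. senales = band_signals(puntuacion)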

prices_train = prices.loc[start_train_date:start_date]
# .loc slicing is end-inclusive, so drop the row shared with the test window
prices_train = prices_train.drop(prices_train.index[-1], axis=0)
prices_test = prices.loc[start_date:end_date]

visual_coint(prices_train[etfs[0]],prices_train[etfs[1]])
visual_coint(prices_test[etfs[0]],prices_test[etfs[1]])

result = ts.coint(prices_train[etfs[0]], prices_train[etfs[1]])  # cointegration test on the training window
print(result)
pvalue = result[1]  # extract the p-value
print(pvalue)


ganado = 0  # cumulative return of winning trades ('exitos')
for exito in exitos:
    ganado += (exito[2] - 1)
print('Money won: ', 100000 * ganado)
perdido = 0  # cumulative return of losing trades ('fracasos')
for pifia in fracasos:
    perdido += (pifia[2] - 1)
print('Money lost: ', 100000 * perdido)
print('Profit: ', 100000 * ganado + 100000 * perdido)
Example n. 48
 def __init__(self):
     #self.coint_t = coint(self.y1, self.y2, trend="c")[0]
     self.coint_t = coint(self.y1, self.y2, trend="c", maxlag=0, autolag=None)[0]
     self.teststat = -1.8208817
     # the second assignment overrides the first reference value
     self.teststat = -1.830170986148
Example n. 50
# In[305]:

(Y-X).plot() # Plot the spread
plt.axhline((Y-X).mean(), color='red', linestyle='--') # Add the mean


# ## Testing for Cointegration
# 
# That's an intuitive definition, but how do we test for this statistically? There is a convenient test that lives in `statsmodels.tsa.stattools`. We should see a very low p-value, as we've artificially created two series that are as cointegrated as physically possible.

# In[306]:

# compute the p-value of the cointegration test
# will inform us as to whether the spread btwn the 2 timeseries is stationary
# around its mean
score, pvalue, _ = coint(X,Y)
print(pvalue)


# ### Correlation vs. Cointegration
# 
# Correlation and cointegration, while theoretically similar, are not the same. To demonstrate this, we'll show examples of series that are correlated, but not cointegrated, and vice versa. To start let's check the correlation of the series we just generated.

# In[307]:

X.corr(Y)


# That's very high, as we would expect. But how would two series that are correlated but not cointegrated look?
# 
# ### Correlation Without Cointegration
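# A minimal sketch of such a pair, assuming the same imports as above: adding a deterministic trend to one series keeps the day-to-day co-movement (and hence the correlation) high, while the spread drifts away from any fixed mean, so the cointegration test should not reject.

X2 = pd.Series(np.cumsum(np.random.normal(0, 1, 100)), name='X2') + 50
Y2 = X2 + np.linspace(0, 20, 100)  # a steadily growing gap
Y2.name = 'Y2'
print('Correlation: %.5f' % X2.corr(Y2))           # high
print('Cointegration P: %.5f' % coint(X2, Y2)[1])  # typically far from zero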