if __name__ == "__main__":

    """ Step 1. Reading and filtering the corpus of the source and target language """
    if Amazon_reviews:
        docs_source, labels_source = read_documents(
            urpath + 'data/Amazon_data/amazon-English-EntireData.txt')
        docs_target, labels_target = read_documents(
            urpath + 'data/Amazon_data/amazon-Swedish-first10%.txt')
    else:
        docs_source = read_documents_novel(urpath, "AFoolFree.txt")
        docs_target = read_documents_novel(urpath, "EnDåreFri.txt")

    f_source = filtering(stemmer_source, stop_words_source)
    filtered_docs_source = f_source.preprocessing(docs_source)
    f_target = filtering(stemmer_target, stop_words_target)
    filtered_docs_target = f_target.preprocessing(docs_target)

    """ Step 2. Merging both languages to build a common word embedding """
    filtered_docs_merged = []
    filtered_docs_merged.extend(filtered_docs_source)
    filtered_docs_merged.extend(filtered_docs_target)

    # FastText expects tokenized documents, so split each document into tokens.
    tokenized_docs = [doc.split() for doc in filtered_docs_merged]
    joint_modelvw = FastText(tokenized_docs, size=dim_trained_embedding, window=5,
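    # --- Illustrative sketch, not part of the original pipeline ---
    # Assuming the gensim FastText API, the joint model trained on the merged
    # source/target corpus could be inspected for cross-lingual neighbours
    # once training finishes, e.g.:
    #
    #     print(joint_modelvw.wv.most_similar('good', topn=10))
    #
    # The query word 'good' is only an example; any token seen during training
    # (or, with FastText's subword n-grams, even an unseen word) can be queried.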
        self.A = A
        self.B1 = B1
        self.mu1v = mu1v
        self.sig1v = sig1v
        self.mu1h = mu1h
        self.sig1h = sig1h
        self.pstgstm1ctm1 = pstgstm1ctm1
        self.ps1 = ps1
        self.muh1 = muh1
        self.sigh1 = sigh1


p = P_testing(B0, mu0v, sig0v, mu0h, sig0h, A, B1, mu1v, sig1v, mu1h, sig1h,
              pstgstm1ctm1, ps1, muh1, sigh1)

f, F, w, alpha, loglik, reset_prob = filtering(p, V, 5)

s0 = []
s1 = []
s2 = []
s3 = []
for t in range(T):
    s0.append(sum(w[t][0]))
    s1.append(sum(w[t][1]))
    s2.append(sum(w[t][2]))
    s3.append(sum(w[t][3]))

"""
plt.plot(s0)
plt.title('S0')
plt.show()
plt.plot(s1)
if Amazon_reviews:
    docs, labels = read_documents(
        urpath + 'data/Amazon_data/amazon-English-EntireData.txt')
    """
    Alternative, smaller corpora:
        data/Amazon_data/amazon-English-first10%.txt
        data/Amazon_data/amazon-English-second10%.txt
        data/Amazon_data/amazon-Swedish-first10%.txt
        data/Amazon_data/amazon-Swedish-second10%.txt
        amazon-Persian-first10%.txt  # use another pre-processing method for the Persian corpus
    """
else:
    docs = read_documents_novel(
        urpath, ".txt")  # .csv, .pdf, ... or type the name of a file

""" Pre-processing the data: filtering/cleaning etc. """
f = filtering(stemmer, stop_words)
filtered_docs = f.preprocessing(docs)

with open(urpath + "temporary_files/docs_filtered.txt", "w") as output:
    for doc in filtered_docs:
        output.write('%s\n' % doc)

corpus = Corpus()
if Amazon_reviews:
    corpus.load_text(urpath + "temporary_files/docs_filtered.txt",
                     valid_split=1 - rate_usageOfData_Amazon)
else:
    corpus.load_text(urpath + "temporary_files/docs_filtered.txt",
                     valid_split=1 - rate_usageOfData_novels)
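# Quick sanity check (illustrative sketch, not part of the original pipeline):
# read back the first few filtered documents that were just written to disk.
with open(urpath + "temporary_files/docs_filtered.txt") as check:
    for _, line in zip(range(3), check):
        print(line.strip())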
df = pd.read_csv('btc_15m_3days.csv')
print(df)

V = np.array([df.mvavg_3h.to_list()])
V = 1000000 * (V - V.mean())
V = V[0].tolist()

plt.plot(V)
plt.show()

T = np.size(V)
dh = 1
S = 2
p = P(2, 1)

V = np.array([V])
print('V', V)

f, F, w, alpha, loglik = filtering(p, V, 2)

w_1 = np.zeros(T)
w_2 = np.zeros(T)
for t in range(T):
    w_1[t], w_2[t] = np.sum(w[t], axis=1)

print('w_1', w_1)
print('w_2', w_2)
print('w_1+w_2', w_1 + w_2)

plt.plot(w_1, c='b')
plt.plot(w_2, c='r')
plt.show()

"""
x,beta = RTSLinearSmoother(p,V,f,F,w,2)
print(x)
        self.mu0h = mu0h
        self.sig0h = sig0h
        self.A = A
        self.B1 = B1
        self.mu1v = mu1v
        self.sig1v = sig1v
        self.mu1h = mu1h
        self.sig1h = sig1h
        self.pstgstm1ctm1 = pstgstm1ctm1
        self.ps1 = ps1
        self.muh1 = muh1
        self.sigh1 = sigh1


p = P_testing(B0, mu0v, sig0v, mu0h, sig0h, A, B1, mu1v, sig1v, mu1h, sig1h,
              pstgstm1ctm1, ps1, muh1, sigh1)

numgaussians = 3
f, F, w, alpha, loglik, reset_prob = filtering(p, V, numgaussians)
x, beta = RTSLinearSmoother(p, V, f, F, w, numgaussians)
print(x)

mass_aprox = np.zeros(shape=[S, T])
print('-----')
for t in range(T):
    for s in range(S):
        print(sum(sum(x[t][s])))
        # mass_aprox[s, t] = sum(sum(x[t][s])) if sum(sum(x[t][s])) <= 1 else 1
        mass_aprox[s, t] = sum(sum(x[t][s]))

print(mass_aprox)
print(np.sum(mass_aprox, axis=0))
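# --- Illustrative only, not in the original script ---
# The smoothed per-state mass can be renormalised across states so each
# column sums to one, giving explicit posterior state probabilities.
# `state_post` is a hypothetical name introduced here for illustration.
state_post = mass_aprox / np.sum(mass_aprox, axis=0, keepdims=True)
print(state_post)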
def test_tr_mr(sig0v_val, abs_mu1h, sig1h_val):
    """Run the trending / mean-reverting filter for one parameter setting and
    return the per-step strategy returns."""
    mu1h = np.empty(shape=[2, 1], dtype=object)
    mu1h[0] = np.array([[abs_mu1h]])
    mu1h[1] = np.array([[-abs_mu1h]])

    mu0h = np.empty(shape=[2, 1], dtype=object)
    mu0h[0] = np.array([[0]])
    mu0h[1] = np.array([[0]])

    sig0v = [sig0v_val, sig0v_val]

    sig1h = np.empty(shape=[2, 1, 1], dtype=object)
    sig1h[0] = np.array([[sig1h_val]])
    sig1h[1] = np.array([[sig1h_val]])

    B0 = np.array([[1], [1]])
    mu0v = [0, 0]

    sig0h = np.empty(shape=[2, 1, 1], dtype=object)
    sig0h[0] = np.array([[.01]])
    sig0h[1] = np.array([[.01]])

    A = np.empty(shape=[2, 1, 1], dtype=object)
    A[0] = np.array([[1]])
    A[1] = np.array([[1]])

    B1 = np.empty(shape=[2, 1], dtype=object)
    B1[0] = np.array([[1]])
    B1[1] = np.array([[1]])

    mu1v = np.array([0, 0])
    sig1v = np.array([.01, .01])

    pstgstm1ctm1 = np.empty(shape=[2, 2, 2])
    pstgstm1ctm1[:, :, 0] = np.array([[0.95, .05], [0.05, 0.95]])
    pstgstm1ctm1[:, :, 1] = np.array([[1, 1.0000e-06], [1.0000e-06, 1]])

    ps1 = np.array([[.964, .036]])

    muh1 = np.empty(shape=[2, 1], dtype=object)
    muh1[0] = np.array([[.5]])
    muh1[1] = np.array([[-.5]])

    sigh1 = np.empty(shape=[2, 1, 1])
    sigh1[0] = np.array([[.1]])
    sigh1[1] = np.array([[.1]])

    p = P_testing(B0, mu0v, sig0v, mu0h, sig0h, A, B1, mu1v, sig1v, mu1h, sig1h,
                  pstgstm1ctm1, ps1, muh1, sigh1)
    f, F, w, alpha, loglik, reset_prob = filtering(p, V, 8)

    # Posterior mass of each switch state at every time step.
    s0 = []
    s1 = []
    for t in range(T):
        s0.append(sum(w[t][0]))
        s1.append(sum(w[t][1]))
    s0 = np.array(s0)
    s1 = np.array(s1)

    #plt.bar(range(np.size(s0)), s0, color='g', label='Trending', width=1)
    #plt.bar(range(np.size(s0)), s1, color='r', bottom=s0, label='Mean Reverting', width=1)
    #plt.title('Swap Spread - 30 Day Moving Average')
    #plt.legend(loc='lower left')
    #plt.show()

    abs_diff = df2['abs_spread_less_mvavg'].tolist()
    logreturns = df2['tr_mr'].tolist()

    ret = []
    log_ret_trending = []
    log_ret_mean_reverting = []
    for i in range(T - 1):
        if s0[i] > s1[i]:  # greater probability of trending
            ret.append(abs_diff[i + 1] - abs_diff[i])
            #log_ret_trending.append(logreturns[i+1])
            log_ret_trending.append(abs_diff[i + 1] - abs_diff[i])
        else:  # greater probability of mean reverting
            ret.append(abs_diff[i] - abs_diff[i + 1])
            #log_ret_mean_reverting.append(logreturns[i+1])
            log_ret_mean_reverting.append(abs_diff[i + 1] - abs_diff[i])

    import seaborn as sns

    trending_mean = np.mean(log_ret_trending)
    mr_mean = np.mean(log_ret_mean_reverting)

    sns.distplot(log_ret_mean_reverting, hist=False, label='Mean Reverting', color='r')
    plt.axvline(mr_mean, color='r', linestyle='--', label='mean reverting mean')
    plt.title('Mean Reverting Return Dist')
    plt.axvline(trending_mean, color='g', linestyle='--', label='trending mean')
    sns.distplot(log_ret_trending, hist=False, label="Trending", color='g')
    plt.title('Trending vs MR Return Dist')
    plt.show()

    print('mr mean', np.mean(log_ret_mean_reverting))
    print('tr mean', np.mean(log_ret_trending))
    print('ret', np.sum(ret))
    print(len(abs_diff))
    print(len(s0))
    print(len(s1))

    return ret
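# Hypothetical usage sketch (parameter values are illustrative, not tuned):
#
#     ret = test_tr_mr(sig0v_val=0.01, abs_mu1h=0.5, sig1h_val=0.1)
#     print('cumulative strategy return', np.sum(ret))
#
# Note that test_tr_mr assumes V, T and df2 are already defined at module level.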
        self.A = A
        self.B1 = B1
        self.mu1v = mu1v
        self.sig1v = sig1v
        self.mu1h = mu1h
        self.sig1h = sig1h
        self.pstgstm1ctm1 = pstgstm1ctm1
        self.ps1 = ps1
        self.muh1 = muh1
        self.sigh1 = sigh1


p = P_testing(B0, mu0v, sig0v, mu0h, sig0h, A, B1, mu1v, sig1v, mu1h, sig1h,
              pstgstm1ctm1, ps1, muh1, sigh1)

f, F, w, alpha, loglik, reset = filtering(p, V, 3)
x, beta = RTSLinearSmoother(p, V, f, F, w, 3)
print('x[5]', x[5])

mass_aprox = np.zeros(shape=[S, T])

"""
print('-----')
for t in range(T):
    for s in range(S):
        print(sum(sum(x[t][s])))
        #mass_aprox[s, t] =
        #mass_aprox[s,t] = sum(sum(x[t][s])) if sum(sum(x[t][s])) <=1 else 1
        mass_aprox[s, t] = sum(sum(x[t][s]))
print(mass_aprox)
print(np.sum(mass_aprox, axis=0))