def cmodel(company, dt1, dt2, num_of_states):
    """Fit a Gaussian HMM to daily close-price differences and save it.

    Args:
        company: Identifier passed to ``quotes_historical_yahoo_ochl`` and
            used as the prefix of the saved model's file name.
        dt1: Start of the quote range.
        dt2: End of the quote range.
        num_of_states: Number of hidden states of the HMM.

    Returns:
        A tuple ``(expected_days, tr_mls)``. ``tr_mls`` is the
        self-transition probability of the most likely hidden state at the
        last observation, and ``expected_days`` is ``1 / (1 - tr_mls)``.
        Both are ``1`` when ``num_of_states`` is not greater than 1.

    Side effects:
        The fitted model is written with ``joblib.dump`` to
        ``./sims_final/<company>_<num_of_states>_states_model_final.pkl``.
    """
    # Here we set the time range
    quotes = quotes_historical_yahoo_ochl(company, dt1, dt2)

    # Element 2 of every quote is used as the close value; the model is
    # trained on its first difference, packed as a single-feature matrix.
    close_v = np.array([q[2] for q in quotes])
    diff = np.diff(close_v)
    X = np.column_stack([diff])

    # Create HMM instance and fit
    model = GaussianHMM(n_components=num_of_states,
                        covariance_type="full",
                        n_iter=1000).fit(X)

    expected_days = 1
    tr_mls = 1
    if num_of_states > 1:
        # Identify the most likely last hidden state
        try:
            hidden_probs = model.predict_proba(X)
        except Exception:
            # The full-covariance model could not be evaluated; refit with
            # diagonal covariances.  ``Exception`` (instead of a bare
            # ``except``) lets KeyboardInterrupt/SystemExit propagate.
            model = GaussianHMM(n_components=num_of_states,
                                covariance_type="diag",
                                n_iter=1000).fit(X)
            hidden_probs = model.predict_proba(X)
        mls = hidden_probs[-1].argmax()

        # self transition probability for the most likely last hidden state
        tr_mls = model.transmat_[mls][mls]

        # we make use of the geometric series formula to calculate the number
        # of days expected to stay at the current state
        expected_days = 1.0 / (1 - tr_mls)

    # we save the model for future use; create the target directory first so
    # a fresh checkout does not fail on the dump.
    out_dir = './sims_final'
    if not os.path.isdir(out_dir):
        os.makedirs(out_dir)
    fname = str(company) + "_" + str(num_of_states) + "_states_model_final.pkl"
    joblib.dump(model, os.path.join(out_dir, fname))

    return expected_days, tr_mls
def fit_hmm(turb_series):
    """Fit a two-state Gaussian HMM and summarise the fitted model.

    Args:
        turb_series: Observations to fit. A copy is taken, and its ``index``
            is reused for the returned frames, so a pandas object is expected.

    Returns:
        A tuple ``(frame, results)``. ``frame`` is the copied input
        concatenated column-wise with the posterior state probabilities and
        the decoded state per row. ``results`` is the list
        ``[initial_state, persistence_normal, transition_normal, mean_normal,
        std_dev_normal, persistence_event, transition_event, mean_event,
        std_dev_event]``, where state 0 is reported as "normal" and state 1
        as "event".
    """
    data = turb_series.copy()
    hmm_model = GaussianHMM(n_components=2,
                            covariance_type="full",
                            n_iter=1000).fit(data)
    decoded = hmm_model.predict(data)

    def _state_summary(state, other):
        # (persistence, transition to the other state, mean, std-dev) of the
        # first feature for one hidden state.
        return [
            hmm_model.transmat_[state][state],
            hmm_model.transmat_[state][other],
            hmm_model.means_[state][0],
            np.sqrt(hmm_model.covars_[state])[0][0],
        ]

    hmm_model_results = [decoded[0]] + _state_summary(0, 1) + _state_summary(1, 0)

    state_frame = pd.DataFrame(decoded,
                               columns=["NormalorEventClass"],
                               index=data.index)
    # NOTE(review): column 0 of predict_proba belongs to state 0, which the
    # summary above calls "normal", yet it is labelled "Event.Prob" here --
    # confirm which convention is intended.
    posterior_frame = pd.DataFrame(hmm_model.predict_proba(data),
                                   columns=["Event.Prob", "Normal.Prob"],
                                   index=data.index)
    combined = pd.concat([data, posterior_frame, state_frame], axis=1)
    return combined, hmm_model_results
def train(self, k, train_set, valid_set):
    """Fit a Gaussian HMM on the training folds and score the validation fold.

    Args:
        k: Fold identifier; only used in the saved model's file name.
        train_set: Iterable of folds, each an iterable of
            ``(wav, fold, label)`` triples; the folds are chained together.
        valid_set: Iterable of ``(wav, fold, label)`` triples.

    Side effects:
        Dumps the fitted model to ``{self.model_path}/hmm10-{k}.pkl`` and
        prints the purity score of the validation predictions.
    """
    train_wavs, train_folds, train_labels = (
        np.array(column) for column in zip(*chain(*train_set)))
    train_x, _ = self.fix_frame(len(train_wavs), train_wavs, train_folds,
                                train_labels)

    # Test Model
    valid_wavs, valid_folds, valid_labels = (
        np.array(column) for column in zip(*valid_set))
    valid_x, valid_y = self.fix_frame(len(valid_wavs), valid_wavs,
                                      valid_folds, valid_labels)

    if config.isPCA:
        # The projection is learned on the training data only and then
        # applied to both splits.
        pca = PCA(n_components=config.n_pca)
        pca.fit(train_x)
        train_x = pca.transform(train_x)
        valid_x = pca.transform(valid_x)

    hmm = GaussianHMM(n_components=self.component)
    hmm.fit(train_x)
    joblib.dump(hmm, f"{self.model_path}/hmm10-{k}.pkl")

    # Reduce along axis 1 so that each validation sample gets its own most
    # probable state; without the axis, argmax returns a single index into
    # the flattened posterior matrix.
    true_classes = np.argmax(valid_y, axis=1)
    predicted_states = np.argmax(hmm.predict_proba(valid_x), axis=1)
    score = purity_score(true_classes, predicted_states)
    print('Accuracy:{0:.3f}'.format(score))
class GaussHMM:
    """Gaussian HMM whose parameters are set from labelled training data.

    ``init`` is the start-probability vector; its length fixes the number of
    hidden states.  Means, covariances and the transition matrix are computed
    from a signal and its per-sample state labels (``channels``) rather than
    learned by EM.
    """

    def __init__(self, init):
        self.init = init

    def fit(self, signals, channels):
        """Build the HMM and overwrite its parameters with empirical ones.

        Args:
            signals: 1-D sequence of observations.
            channels: Integer state label (0..k-1) for every observation.
        """
        signals = np.asarray(signals)
        channels = np.asarray(channels)
        self.hmm = GaussianHMM(n_components=len(self.init),
                               covariance_type="full",
                               n_iter=100)
        # Short fit on the first 100 samples; every parameter it estimates is
        # replaced below (presumably this only initialises the estimator).
        self.hmm.fit(signals.reshape([-1, 1])[:100])
        self.hmm.means_ = self.get_mean(signals, channels)
        self.hmm.covars_ = self.get_cov(signals, channels)
        self.hmm.startprob_ = self.init
        self.hmm.transmat_ = self.markov_p_trans(channels)

    def predict(self, signals):
        """Return the decoded state for each observation in ``signals``."""
        return self.hmm.predict(np.asarray(signals).reshape([-1, 1]))

    def predict_proba(self, signals):
        """Return per-state posterior probabilities rounded to 3 decimals."""
        posteriors = self.hmm.predict_proba(
            np.asarray(signals).reshape([-1, 1]))
        return posteriors.round(3)

    def get_mean(self, signals, channels):
        """Return the per-state mean of ``signals`` with shape (k, 1)."""
        signals = np.asarray(signals)
        channels = np.asarray(channels)
        n_states = len(np.unique(channels))
        means = [signals[channels == state].mean()
                 for state in range(n_states)]
        return np.array(means).reshape([-1, 1])

    def get_cov(self, signals, channels):
        """Return the per-state variance of ``signals`` with shape (k, 1, 1)."""
        signals = np.asarray(signals)
        channels = np.asarray(channels)
        n_states = len(np.unique(channels))
        variances = [np.cov(signals[channels == state])
                     for state in range(n_states)]
        return np.array(variances).reshape([-1, 1, 1])

    def markov_p_trans(self, states):
        """Estimate the row-stochastic transition matrix of ``states``.

        Only genuine consecutive pairs ``states[t] -> states[t + 1]`` are
        counted; the last sample has no successor, so no wrap-around
        transition back to the first sample is introduced.  A state with no
        outgoing transition gets a uniform row.
        """
        states = np.asarray(states)
        n_states = int(np.max(states)) + 1
        current, following = states[:-1], states[1:]
        bin_edges = np.arange(n_states + 1)

        rows = []
        for state in range(n_states):
            counts = np.histogram(following[current == state],
                                  bins=bin_edges)[0]
            total = np.sum(counts)
            if total == 0:
                # the state never occurs with a successor: uniform row
                rows.append(np.ones(n_states) / n_states)
            else:
                # normalize to 1
                rows.append(counts / total)
        return np.array(rows)
def HHM_stock(stock, startdate, enddate, predict_startdate, predict_enddate,
              hmmcomponents=4, cov_type='full'):
    """Fit a Gaussian HMM on one date range and label the states of another.

    Args:
        stock: Security identifier passed to ``get_price``.
        startdate: Start of the fitting range.
        enddate: End of the fitting range.
        predict_startdate: Start of the range to decode.
        predict_enddate: End of the range to decode.
        hmmcomponents: Number of hidden states.
        cov_type: ``covariance_type`` passed to ``GaussianHMM``.

    Returns:
        A tuple ``(hmm, hhm_pred)``: the fitted model and a DataFrame indexed
        by date with the columns ``close``, ``state`` (decoded state),
        ``score`` (posterior probability of that state) and ``action`` (the
        label ``hhm_state2read`` assigns to the state).

    Note:
        ``warnings.filterwarnings("ignore")`` is applied and not restored.
    """
    from hmmlearn.hmm import GaussianHMM
    import numpy as np
    import pandas as pd
    import warnings

    def get_hmm_feature(stock, startdate, enddate):
        # Build the feature matrix; the first 5 rows are dropped everywhere
        # because the 5-day features need that much history.
        df = get_price(stock, start_date=startdate, end_date=enddate,
                       frequency='1d',
                       fields=['close', 'money', 'volume', 'high', 'low', 'open'],
                       skip_paused=True)
        close = df['close']
        money = df['money']
        high = df['high'][5:]
        low = df['low'][5:]
        datelist = pd.to_datetime(close.index[5:])
        closeidx = close[5:]

        log_close = np.log(np.array(close))
        logreturn = (log_close[1:] - log_close[:-1])[4:]   # 1-day log return
        logreturn5 = log_close[5:] - log_close[:-5]        # 5-day log return
        rangereturn = np.log(np.array(high)) - np.log(np.array(low))

        # Log ratio of the day's turnover to the 4-day rolling mean ending
        # the day before.  ``Series.rolling`` replaces the removed
        # ``pd.rolling_mean``.
        money_ma5 = money.rolling(4).mean()
        money_ma5_rate = (np.log(np.array(money[5:]))
                          - np.log(np.array(money_ma5[4:-1])))

        features = np.column_stack(
            [logreturn, rangereturn, logreturn5, money_ma5_rate])
        return (closeidx, datelist, features)

    closeidx_fit, datelist_fit, data_fit = get_hmm_feature(
        stock, startdate, enddate)
    closeidx_pred, datelist_pred, data_predict = get_hmm_feature(
        stock, predict_startdate, predict_enddate)

    warnings.filterwarnings("ignore")
    hmm = GaussianHMM(n_components=hmmcomponents,
                      covariance_type=cov_type,
                      n_iter=5000).fit(data_fit)

    # Lookup from state index to a readable label (indexed by state below).
    hidden_state_meaning = hhm_state2read(hmm)

    _, predict_states_sequence = hmm.decode(data_predict)
    predict_all_scores_sequence = hmm.predict_proba(data_predict)
    # Posterior probability of the decoded state on each day.
    predict_states_score_sequence = [
        predict_all_scores_sequence[idx][s]
        for idx, s in enumerate(predict_states_sequence)
    ]

    hhm_pred = pd.DataFrame(
        {'close': closeidx_pred,
         'state': predict_states_sequence,
         'score': predict_states_score_sequence,
         'action': [hidden_state_meaning[s] for s in predict_states_sequence]},
        index=datelist_pred)
    return (hmm, hhm_pred)
class StockHMM:
    """Gaussian HMM over daily price data loaded from a CSV file.

    The CSV's first line is a header; every other line holds a date followed
    by six numeric columns that are stored as ``open``, ``high``, ``low``,
    ``close``, ``adj_close`` and ``volume``.
    """

    def __init__(self, stock=STOCK.Google):
        """Load the CSV that belongs to ``stock``.

        Raises:
            SystemError: ``stock`` is not one of the supported STOCK values.
        """
        if stock == STOCK.Google:
            path = './data/GOOG.csv'
        elif stock == STOCK.Baidu:
            path = './data/BIDU.csv'
        elif stock == STOCK.Tencent:
            path = './data/TCEHY.csv'
        else:
            print('Invalid argument!')
            raise SystemError()

        # initialize data
        data, self.dates = self.get_data(path=path)
        self.open = data[:, 0]
        self.high = data[:, 1]
        self.low = data[:, 2]
        self.close = data[:, 3]
        self.adj_close = data[:, 4]
        # the number of stocks in stock transactions per day
        self.volume = data[:, 5]

        self.model = None

    def get_data(self, path):
        """Read the CSV at ``path``.

        Returns:
            ``(values, dates)``: a 2-D float array built from columns 1..6 of
            each data row, and an array of the column-0 strings.
        """
        rows = []
        dates = []
        # The context manager closes the file even if a row fails to parse.
        with open(path) as f:
            next(f, None)  # the first line is the header
            for line in f:
                if not line.strip():
                    # skip blank lines (e.g. a trailing newline at EOF)
                    continue
                fields = line.split(',')
                dates.append(fields[0])
                rows.append(np.double(fields[1:7]))
        return np.array(rows), np.array(dates)

    def train(self, nc, n):
        """Fit an ``nc``-state HMM on the features of the first ``n`` rows."""
        features = self.features_extraction(n)
        self.model = GaussianHMM(n_components=nc,
                                 covariance_type="full",
                                 n_iter=2000).fit(features)

    def features_extraction(self, n):
        """Return the (n - 5) x 3 feature matrix for rows ``5 .. n-1``.

        Columns: log(high) - log(low) of the day, 5-day log difference of
        close, 5-day log difference of volume.  Requires
        ``5 < n < number of rows``.
        """
        assert 5 < n < self.high.shape[0]
        # log difference of high and low
        ld_hl = (np.log(self.high) - np.log(self.low))[5:n]
        # log difference of close (every 5 days)
        ld_c5 = np.log(self.close[5:n]) - np.log(self.close[:n - 5])
        ld_v5 = np.log(self.volume[5:n]) - np.log(self.volume[:n - 5])

        # concatenate to form features
        return np.column_stack([ld_hl, ld_c5, ld_v5])

    def predict(self, n):
        """Return the state distribution predicted for period ``n``.

        The posterior over states at the last row of the features extracted
        with ``n - 1`` is multiplied by the transition matrix, i.e. advanced
        by one step.
        """
        features = self.features_extraction(n - 1)
        hidden_states_proba = self.model.predict_proba(features)
        last_posterior = hidden_states_proba[-1, :]
        return last_posterior.dot(self.model.transmat_)
# Report the shape of every input series before packing the features.
print('_log_returns.shape', _log_returns.shape)
print('dates.shape', dates.shape)
print('close_v.shape', close_v.shape)

# Feature matrix: one column of log returns, one column of volume.
X = np.column_stack((_log_returns, volume))

print("fitting to HMM and decoding ...", end="")

# Build a 3-state, diagonal-covariance HMM and fit it on the features.
model = GaussianHMM(n_components=3, covariance_type="diag", n_iter=1000)
model = model.fit(X)

# Posterior probability of every hidden state for every sample (a matrix of
# probabilities, not a decoded state sequence).
hidden_states = model.predict_proba(X)
print('hidden_states.shape', hidden_states.shape)

print("done")

print("Transition matrix")
print(model.transmat_)
print()

print("Means and vars of each hidden state")
for i, (state_mean, state_covar) in enumerate(zip(model.means_, model.covars_)):
    print("{0}th hidden state".format(i))
    print("mean = ", state_mean)
    # np.diag pulls the diagonal out of this state's covariance entry.
    print("var = ", np.diag(state_covar))
    print()
# Rescale column 4 of the test data with the 'tanh' method of
# dt.min_max_normalize.
# (previous formula: 2*(test_data[:, 4]-min_vol)/(max_vol-min_vol)-1)
test_data[:, 4] = dt.min_max_normalize(test_data[:, 4], method='tanh')

# The HMM is trained on column 5 only, shaped as a single-feature matrix.
hmm_input_train = np.column_stack([train_data[:, 5]])
hmm_input_test = np.column_stack([test_data[:, 5]])

if (save_model and os.path.isfile(hmm_model_file)):
    # Reuse a previously saved model.  joblib.load unpickles the file, so it
    # must only be pointed at trusted model files.
    hmm_model = joblib.load(hmm_model_file)
else:
    hmm_model = GaussianHMM(n_components=hmm_components,
                            covariance_type="diag",
                            n_iter=1000).fit(hmm_input_train)
    joblib.dump(hmm_model, hmm_model_file)

# Posterior state probabilities for both splits.
hmm_train = hmm_model.predict_proba(hmm_input_train)
hmm_test = hmm_model.predict_proba(hmm_input_test)

if (False):
    # Debug plot (disabled): close price on top, state posteriors below.
    ax1 = plt.subplot(2, 1, 1)
    ax1.plot(df['close'].values[train_rows:], label='Close')
    ax2 = plt.subplot(2, 1, 2)
    # One curve per hidden state, each with its own label, for however many
    # states the model has.
    for state in range(hmm_test.shape[1]):
        ax2.plot(hmm_test[:, state], label='Hidden {}'.format(state))
    ax2.set_ylim([0, 1])
    plt.show()
model = model.fit(X)

# Print the fitted parameters; each title is followed by its value on the
# next line.
print("样本量:", X.shape, sep="\n")
print("给定的隐藏特征数目:", n, sep="\n")
print("初始的隐藏状态概率π:", model.startprob_, sep="\n")
print("状态转移矩阵A参数:", model.transmat_, sep="\n")
print("估计均值:", model.means_, sep="\n")
print("估计方差:", model.covars_, sep="\n")

# Posterior state probabilities for every sample.
y = model.predict_proba(X)
print("预测的概率:", y, sep="\n")

# Decoded hidden-state sequence.
hidden_states = model.predict(X)
print("预测状态值:", hidden_states, sep="\n")

print(model.score(X))

# The HMM only separates the data into distinct states; giving each state a
# real-world market meaning has to be done by a person inspecting the result.
# Draw the close price segment by segment, coloured by the state at the
# segment's start.
n_states = model.n_components
for j in range(len(close) - 1):
    state = hidden_states[j]
    if 0 <= state < n_states:
        plt.plot([dates[j], dates[j + 1]],
                 [close[j], close[j + 1]],
                 color=colors[state])
plt.show()

# import pandas as pd
#
# data = pd.DataFrame({'datelist': dates, 'close': close, 'state': hidden_states}).set_index('dates')