def detect_events_hmm(mahal_timeseries, c_timeseries, global_pace_timeseries,
                      threshold_quant=.95):
    """Find events by Viterbi-decoding a two-state HMM over outlier symbols.

    Each timestamp becomes a binary symbol: 1 when its Mahalanobis value is
    above the ``threshold_quant`` quantile of all Mahalanobis values, or when
    its ``c_timeseries`` entry equals 1; 0 otherwise. The symbols are decoded
    with a ``MultinomialHMM`` whose transition and emission matrices are fixed
    below (hidden state 0 = no event, hidden state 1 = event).

    :param mahal_timeseries: mapping timestamp -> Mahalanobis value; its
        sorted keys define the order used for all other series
    :param c_timeseries: mapping timestamp -> flag that is compared with 1
    :param global_pace_timeseries: mapping timestamp -> global pace
    :param threshold_quant: quantile handed to ``getQuantile`` to obtain the
        outlier threshold
    :return: ``(events, predictions)`` where ``events`` is the output of
        ``get_all_events`` sorted by its third field (the duration), longest
        first, and ``predictions`` is the decoded hidden-state sequence
    """
    # Chronological key order shared by every series.
    dates = sorted(mahal_timeseries)
    expected_pace_timeseries, _ = getExpectedPace(global_pace_timeseries)

    # Values of each series, aligned on the sorted dates.
    mahal_values = [mahal_timeseries[date] for date in dates]
    flag_values = [c_timeseries[date] for date in dates]
    pace_values = [global_pace_timeseries[date] for date in dates]
    expected_values = [expected_pace_timeseries[date] for date in dates]

    # Outlier threshold taken from the requested quantile.
    threshold = getQuantile(sorted(mahal_values), threshold_quant)

    # Observed symbols: 1 marks an outlier, 0 a regular observation.
    symbols = [
        1 if (mahal > threshold or flag == 1) else 0
        for mahal, flag in zip(mahal_values, flag_values)
    ]

    # A heavy diagonal makes the decoded path "sticky", so predictions are
    # smooth instead of flipping state on every isolated outlier.
    trans_matrix = array([[.999, .001],
                          [.001, .999]])

    # State 0 mostly emits symbol 0; state 1 emits an outlier 60% of the time.
    emission_matrix = array([[.95, .05],
                             [.4, .6]])

    model = MultinomialHMM(n_components=2, transmat=trans_matrix)
    model.emissionprob_ = emission_matrix

    # The log-likelihood returned by decode is not needed here.
    _, predictions = model.decode(symbols)

    events = get_all_events(predictions, dates, mahal_values, pace_values,
                            expected_values)
    # Longest events first.
    events.sort(key=lambda event: event[2], reverse=True)
    return events, predictions
def run_hmm_model(input_df, n_unique, A_df, Eta, n_iter = 10000, tol=1e-2, verbose = False, params = 'e', init_params = ''):
    '''
    Fit a MultinomialHMM on the keypress clusters and decode them.

    Returns the decode score, the decoded state sequence and the fitted model.

    input_df : The dataframe of keypresses (its 'cluster' column is the model input)
    n_unique : Number of unique chars (used as the number of hidden states)
    A_df : Dataframe holding the transition matrix
    Eta : Emission matrix
    n_iter : Max number of iterations for the hmm
    tol : Stop fitting when the score does not improve by more than this
    verbose : Whether or not to print out progress
    params : Parameters to tune during fitting
    init_params : Parameters to initialize before fitting
    '''
    # Proportion of characters starting words in English, with a leading 0
    # prepended for the first state.
    char_counts = get_char_counts()
    start_vector = np.append(0, char_counts.values)

    model = MultinomialHMM(
        n_components=n_unique,
        startprob_prior=start_vector,
        transmat_prior=A_df.values,
        algorithm='viterbi',
        random_state=None,
        n_iter=n_iter,
        tol=tol,
        verbose=verbose,
        params=params,
        init_params=init_params,
    )

    # Seed the model parameters explicitly. The start vector is copied so the
    # prior and the parameter never share one array.
    model.emissionprob_ = Eta
    model.transmat_ = A_df.values
    model.startprob_ = start_vector.copy()

    # The cluster ids are the observed symbols; make them a column vector.
    observations = input_df['cluster'].values
    if len(observations.shape) == 1:
        observations = observations.reshape((observations.shape[0], 1))

    fitted = model.fit(observations)
    score, results = fitted.decode(observations)
    return score, results, fitted
def detect_events_hmm(mahal_timeseries, c_timeseries, global_pace_timeseries,
                      threshold_quant=.95, trans_matrix = DEFAULT_TRANS_MATRIX,
                      emission_matrix=DEFAULT_EMISSION_MATRIX, initial_state=None):
    """Find events by Viterbi-decoding a two-state HMM over outlier symbols.

    Each timestamp becomes a binary symbol: 1 when its Mahalanobis value is
    above the ``threshold_quant`` quantile of all Mahalanobis values, or when
    its ``c_timeseries`` entry equals 1; 0 otherwise. The symbol sequence is
    decoded with a ``MultinomialHMM`` configured from the arguments.

    :param mahal_timeseries: mapping timestamp -> Mahalanobis value; its
        sorted keys define the order used for all other series
    :param c_timeseries: mapping timestamp -> flag that is compared with 1
    :param global_pace_timeseries: mapping timestamp -> global pace
    :param threshold_quant: quantile handed to ``getQuantile`` to obtain the
        outlier threshold
    :param trans_matrix: transition matrix passed to the model as ``transmat``
    :param emission_matrix: emission matrix assigned to ``emissionprob_``
    :param initial_state: value passed to the model as ``startprob``
    :return: ``(events, predictions)`` where ``events`` is the output of
        ``get_all_events`` sorted by its third field (the duration), longest
        first, and ``predictions`` is the decoded hidden-state sequence
    """
    # Chronological key order shared by every series.
    dates = sorted(mahal_timeseries)
    expected_pace_timeseries, _ = getExpectedPace(global_pace_timeseries)

    # Values of each series, aligned on the sorted dates.
    mahal_values = [mahal_timeseries[date] for date in dates]
    flag_values = [c_timeseries[date] for date in dates]
    pace_values = [global_pace_timeseries[date] for date in dates]
    expected_values = [expected_pace_timeseries[date] for date in dates]

    # Outlier threshold taken from the requested quantile.
    threshold = getQuantile(sorted(mahal_values), threshold_quant)

    # Observed symbols: 1 marks an outlier, 0 a regular observation.
    symbols = [
        1 if (mahal > threshold or flag == 1) else 0
        for mahal, flag in zip(mahal_values, flag_values)
    ]

    model = MultinomialHMM(n_components=2, transmat=trans_matrix,
                           startprob=initial_state)
    model.emissionprob_ = emission_matrix

    # The log-likelihood returned by decode is not needed here.
    _, predictions = model.decode(symbols)

    events = get_all_events(predictions, dates, mahal_values, pace_values,
                            expected_values)
    # Longest events first.
    events.sort(key=lambda event: event[2], reverse=True)
    return events, predictions
def detect_events_hmm(mahal_timeseries, c_timeseries, global_pace_timeseries,
                      threshold_quant=.95, trans_matrix=DEFAULT_TRANS_MATRIX,
                      emission_matrix=DEFAULT_EMISSION_MATRIX,
                      initial_state=None):
    """Decode outlier symbols with a two-state HMM and extract events.

    A timestamp is marked with symbol 1 if its Mahalanobis value exceeds the
    ``threshold_quant`` quantile of all values or its ``c_timeseries`` entry
    equals 1, and with symbol 0 otherwise. The resulting sequence is decoded
    by a ``MultinomialHMM`` built from ``trans_matrix``, ``emission_matrix``
    and ``initial_state``.

    :param mahal_timeseries: mapping timestamp -> Mahalanobis value; the
        sorted keys give the chronological order for all series
    :param c_timeseries: mapping timestamp -> flag that is compared with 1
    :param global_pace_timeseries: mapping timestamp -> global pace
    :param threshold_quant: quantile forwarded to ``getQuantile``
    :param trans_matrix: passed to the model as ``transmat``
    :param emission_matrix: assigned to the model's ``emissionprob_``
    :param initial_state: passed to the model as ``startprob``
    :return: tuple ``(events, predictions)``; ``events`` comes from
        ``get_all_events`` and is ordered by its third field (the duration)
        in descending order, ``predictions`` is the decoded state sequence
    """
    sorted_dates = sorted(mahal_timeseries)
    expected_pace_timeseries, _ = getExpectedPace(global_pace_timeseries)

    def in_date_order(series):
        # Values of one series, lined up with the chronological key order.
        return [series[date] for date in sorted_dates]

    mahal_list = in_date_order(mahal_timeseries)
    c_list = in_date_order(c_timeseries)
    global_pace_list = in_date_order(global_pace_timeseries)
    expected_pace_list = in_date_order(expected_pace_timeseries)

    # The quantile of the sorted Mahalanobis values is the outlier cut-off.
    threshold = getQuantile(sorted(mahal_list), threshold_quant)

    # "1" for an outlier, "0" otherwise.
    symbols = []
    for mahal_value, c_value in zip(mahal_list, c_list):
        is_outlier = mahal_value > threshold or c_value == 1
        symbols.append(1 if is_outlier else 0)

    model = MultinomialHMM(
        n_components=2,
        transmat=trans_matrix,
        startprob=initial_state,
    )
    model.emissionprob_ = emission_matrix

    # decode returns (log-likelihood, state sequence); only the latter is used.
    _, predictions = model.decode(symbols)

    events = get_all_events(
        predictions,
        sorted_dates,
        mahal_list,
        global_pace_list,
        expected_pace_list,
    )
    # Long events come first.
    events.sort(key=lambda ev: ev[2], reverse=True)
    return events, predictions
def test_DiscreteHMM_decode(cases: str) -> None:
    """Check ``DiscreteHMM.decode`` against ``MultinomialHMM.decode``.

    For each trial a reference ``MultinomialHMM`` is fitted on random
    sequences; its learned parameters are copied into a ``DiscreteHMM`` and
    both models must agree on the total log-probability and on the decoded
    state sequence.

    :param cases: trial bound (converted with ``int``); the loop counter
        starts at 1 and stops before ``cases``, so ``cases - 1`` trials run
    """
    np.random.seed(12346)
    n_cases = int(cases)
    N_decimal = 4
    tol = 1e-3

    for _trial in range(1, n_cases):
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        # symbols is the number of unique observation types.
        symbols = np.random.randint(4, 9)

        # Every sequence is a random permutation of all symbols, so its
        # length equals the number of symbols.
        X = [
            np.random.choice(range(symbols), size=symbols, replace=False)
            for _sample in range(n_samples)
        ]
        lengths = [symbols] * n_samples

        # Reference model: fit on the stacked sequences, then decode them.
        hmm_gold = MultinomialHMM(n_components=hidden_states, n_iter=100, tol=tol)
        X_gold = np.concatenate(X).reshape((-1, 1))
        hmm_gold.fit(X_gold, lengths)
        gold_logprob, gold_state_sequence = hmm_gold.decode(X_gold, lengths)

        # Model under test, loaded with the reference model's parameters.
        hmm_mine = DiscreteHMM(
            hidden_states=hidden_states,
            symbols=symbols,
            A=hmm_gold.transmat_,
            B=hmm_gold.emissionprob_,
            pi=hmm_gold.startprob_,
        )

        # Decode one sequence at a time and accumulate the results.
        decoded = [hmm_mine.decode(this_x) for this_x in X]
        mine_logprob = sum(logprob for logprob, _states in decoded)
        mine_state_sequence = np.concatenate([states for _logprob, states in decoded])

        assert_almost_equal(mine_logprob, gold_logprob, decimal=N_decimal)
        assert_almost_equal(mine_state_sequence, gold_state_sequence, decimal=N_decimal)

    print('Successfully testing the function of computing decodes in discrete HMM!')
emission_probability[1][2] += 1 if train_data['return'][i] < 0.0 and analysis_data['v'][i] == 3: emission_probability[1][3] += 1 emission_probability[0] /= sum(1 for e in train_data['return'] if e >= 0.0) emission_probability[1] /= sum(1 for e in train_data['return'] if e < 0.0) #print(emission_probability) hmm = MultinomialHMM(n_components=n_states) hmm.startprob = start_probability hmm.transmat = transition_probability hmm.emissionprob = emission_probability bob_says = np.array([[0, 2, 1, 1, 2, 0]]).T hmm = hmm.fit(bob_says) logprob, alice_hears = hmm.decode(bob_says, algorithm="viterbi") print("Bob says:", ", ".join(map(lambda x: observations[x], bob_says))) print("Alice hears:", ", ".join(map(lambda x: states[x], alice_hears))) ''' law_data['hmm_states'] = hmm.predict(rets) panel = Figure_Util.Figure() panel.draw(law_data, title='close', subplots=['hmm_states'], figsize=(20, 10)) ''' db.disconnect()
# Tag every test sentence with the HMM and write one "word<TAB>gold<TAB>pred"
# line per token, followed by a blank line after each sentence.

# Word -> emission-row index, built once. The previous code rebuilt
# list(emission_dict.keys()) and scanned it linearly for every token.
_word_to_index = {
    known_word: row for row, known_word in enumerate(emission_dict.keys())
}
# Out-of-vocabulary words are mapped to the last emission row.
_unknown_index = emission_matrix.shape[0] - 1

for sent in test_sents:
    inp = []
    for token in sent:
        word = token[0]
        k = _word_to_index.get(word)
        if k is None:
            # Explicit miss handling replaces a bare `except:` that also hid
            # unrelated errors (and KeyboardInterrupt).
            nexcept += 1
            k = _unknown_index
        inp.append(k)

    # Column vector of symbol indices, shape (len(sent), 1).
    inp = np.atleast_2d(inp).T

    if len(inp) != 1:
        logprob, out_ = model.decode(inp, algorithm='viterbi')
        out_ = [states[x] for x in out_]
    else:
        # Single-token sentences are not decoded; they are always tagged 'O'.
        out_ = ['O']

    for token, pred in zip(sent, out_):
        word = token[0]
        gold = token[-1]
        out.write("{}\t{}\t{}\n".format(word, gold, pred))
    out.write("\n")

# Number of out-of-vocabulary tokens encountered.
print(nexcept)
# Hand-specified HMM parameters; nothing is fitted in this cell.
# `model` is created outside this excerpt (presumably a 3-state
# MultinomialHMM -- confirm). State order follows the legend further down:
# E = 0, 5 = 1, I = 2. Emission columns follow the nucleotide coding
# A = 0, C = 1, G = 2, T = 3.

# Every sequence starts in state E.
model.startprob_ = np.array([1, 0, 0])
# NOTE(review): `endprob_` is not among hmmlearn's documented MultinomialHMM
# attributes, so this assignment presumably has no effect on decoding -- confirm.
model.endprob_ = np.array([0, 0, 0.1])
# E stays in E (0.9) or moves to 5 (0.1); 5 always moves to I; I stays in I.
model.transmat_ = np.array([[0.9, 0.1, 0], [0, 0, 1], [0, 0, 1]])
# Row per state (E, 5, I), column per nucleotide (A, C, G, T).
model.emissionprob_ = np.array([[0.25, 0.25, 0.25, 0.25], [0.05, 0, 0.95, 0], [0.4, 0.1, 0.1, 0.4]])


# In[121]:

#"CTTCATGTGAAAGCAGACGTAAGTCA" A = 0 , C = 1 , G = 2 , T = 3
sequence = [ 1, 3, 3, 1, 0, 3, 2, 3, 2, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 3, 0, 0, 2, 3, 1, 0 ]
# decode is given the sequence as a column vector (one symbol per row).
logprob, seq = model.decode(np.array([sequence]).transpose())
print(logprob)
print(seq)
# E = 0 , 5 = 1 , I = 2
print("following sequence correspond to :")
print("EEEEEEEEEEEEEEEEEE5IIIIIII")


# ## Question 1.2 : HMM
# #### based on the following paper :
# https://editorialexpress.com/cgi-bin/conference/download.cgi?db_name=SILC2016&paper_id=38

# In[90]:

# Calculating Mean Absolute Percentage Error of predictions
def calc_mape(predicted_data, true_data):
hits = 0 for seq in validating_sequences: seq_list = [] for inst in seq: prob_list = [] len_list = [] pred_list = [] if len(seq_list) > 1: for i in range(8): seq_list.append(i) a = np.array(seq_list).reshape(-1,1) len_list = [] len_list.append(len(seq_list)) b = np.array(len_list) prob = model.decode(a , lengths = b) prob_list.append(prob[0]) seq_list.pop() max_value = max(prob_list) pred_value = prob_list.index(max_value) if pred_value == inst[0]: hits+=1 count+=1 seq_list.append(inst[0]) else: seq_list.append(inst[0]) len_list.append(len(seq_list)) print(count)
# Score the negative examples (rows whose 'vdjdb.score' is 0) with the HMM and
# compare them with the training / positive-test scores computed earlier.
negative_score = []
# CDR3-alpha sequences of all rows with vdjdb.score == 0, in row order.
negative = [
    cdr3
    for cdr3, vdjdb_score in zip(data['cdr3.alpha'], data['vdjdb.score'])
    if vdjdb_score == 0
]
for cdr3_seq in negative:
    # `np.int` was only an alias of the builtin `int` and was removed in
    # NumPy 1.24; the builtin gives the same dtype on every NumPy version.
    test = string2matrix_plain(cdr3_seq).astype(int)
    score = model.score(test)
    negative_score.append(score)
negative_score = np.array(negative_score)

import matplotlib.pyplot as plt
fig, ax = plt.subplots()
ax.boxplot([train_score, test_score, negative_score], positions=[1, 2, 3], labels=['training', 'testing_pos', 'testing_neg'])
# Overlay the individual scores, jittered horizontally around each box
# position so that overlapping points remain visible.
x_train = np.random.normal(1, 0.02, size=len(train_score))
ax.plot(x_train, train_score, 'r.', alpha=0.2)
x_test = np.random.normal(2, 0.02, size=len(test_score))
ax.plot(x_test, test_score, 'b.', alpha=0.2)
x_negative = np.random.normal(3, 0.02, size=len(negative_score))
ax.plot(x_negative, negative_score, 'k.', alpha=0.2)
ax.set_title('Observation score from HMM -- "GILGFVFTL"--TCRA')

import scipy.stats as sc
# Two-sample t-test between positive-test and negative scores. The result is
# not stored, so it is only visible in an interactive session.
sc.ttest_ind(test_score, negative_score)

# Interactive inspection of a single example; results are not stored either.
model.predict(test1)
model.predict_proba(test1)
model.decode(test1)
d = _data_on_mouse(data, idx, smoothing_time_radius, smoothing_amplitude_radius, smoothing_tolerance, sampling_interval, bins) discrete_obs.append(d[0]) delta_hws.append(d[1]) delta_fas.append(d[2]) X = np.array(discrete_obs) model = MultinomialHMM(n_components = n_components) predictions = [] for i in range(7): held_out_X = np.vstack((X[:i], X[i+1:])) model.fit(held_out_X) predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1))) f, axarr = plt.subplots(7, 1) yranges = np.arange(n_components+1, dtype=float)/n_components colors = plt.cm.rainbow(np.linspace(0, 1, n_components)) for i in range(7): states, indices = _axvspan_maker(predictions[i][1]) for s, idxs in zip(states, indices): axarr[i].axvspan(idxs[0], idxs[1], ymin=yranges[s], ymax=yranges[s+1], color=colors[s]) plt.show() # healthy_model = MultinomialHMM(n_components = n_components) # healthy_model.fit(dos) # hs_preds = healthy_model.predict(dos.reshape(len(dos), 1))
# Collect the per-mouse data: element 0 of each result is the discrete
# observation sequence, elements 1 and 2 go to delta_hws and delta_fas.
for idx in mice:
    d = _data_on_mouse(data, idx, smoothing_time_radius,
                       smoothing_amplitude_radius, smoothing_tolerance,
                       sampling_interval, bins)
    discrete_obs.append(d[0])
    delta_hws.append(d[1])
    delta_fas.append(d[2])

# One row per mouse.
X = np.array(discrete_obs)
model = MultinomialHMM(n_components=n_components)

# Leave-one-out: for each i, fit on every row of X except row i, then decode
# row i (reshaped to a column vector) with the freshly fitted model.
# NOTE(review): 7 is hard-coded; presumably the number of mice -- confirm.
# NOTE(review): fit receives the stacked rows without a `lengths` argument;
# hmmlearn then treats rows as samples and columns as features -- confirm this
# matches the intended one-sequence-per-mouse layout.
predictions = []
for i in range(7):
    held_out_X = np.vstack((X[:i], X[i + 1:]))
    model.fit(held_out_X)
    predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1)))

# One subplot per held-out row; each hidden state gets its own colour and its
# own horizontal band (a 1/n_components slice of the axis height).
f, axarr = plt.subplots(7, 1)
yranges = np.arange(n_components + 1, dtype=float) / n_components
colors = plt.cm.rainbow(np.linspace(0, 1, n_components))
for i in range(7):
    # predictions[i][1] is the second element returned by decode (the decoded
    # state sequence); _axvspan_maker yields states with matching index pairs.
    states, indices = _axvspan_maker(predictions[i][1])
    for s, idxs in zip(states, indices):
        axarr[i].axvspan(idxs[0], idxs[1], ymin=yranges[s], ymax=yranges[s + 1], color=colors[s])
plt.show()
# healthy_model = MultinomialHMM(n_components = n_components)
class HMM:
    """Supervised HMM tagger built on top of ``MultinomialHMM``.

    Start, transition and emission probabilities are estimated from labelled
    sequences with add-one smoothing and then loaded into a ``MultinomialHMM``
    which performs Viterbi decoding. Every probability table is normalised so
    that each distribution sums to 1.
    """

    def __init__(self):
        pass

    def train(self, obs_seq_list: list, state_seq_list: list, obs_set: list, state_set: list, file):
        """
        Estimate the model parameters from labelled sequences.

        :param obs_seq_list: observation sequence list [[o1, o2, o3], [o1, o2, o3]...]
        :param state_seq_list: state sequence list [[s1, s2, s3], [s1, s2, s3]...]
        :param obs_set: all possible observations; unseen observations are
            mapped to its last entry at prediction time
        :param state_set: all possible states
        :param file: path the trained object is pickled to, or None to skip saving
        """
        self.obs_seq_list = obs_seq_list
        self.state_seq_list = state_seq_list
        self.obs_set = obs_set
        self.state_set = state_set
        # Occurrences of every state over all sequences. Iterating (instead of
        # ''.join) also works when the sequences are lists rather than strings.
        self.counter = Counter(
            state for state_seq in state_seq_list for state in state_seq
        )

        self.hmm = MultinomialHMM(n_components=len(self.state_set))
        self.startprob = self._init_state()
        self.transmat = self._trans_state()
        self.emissionprob = self._emit_state()
        self.hmm.startprob_ = self.startprob
        self.hmm.transmat_ = self.transmat
        self.hmm.emissionprob_ = self.emissionprob

        if file is not None:
            with open(file, 'wb') as f:
                pickle.dump(self, f)

    @staticmethod
    def load_model(file: str = None):
        """
        Load a model previously saved by ``train``.

        :param file: path of the pickle file
        :return: the unpickled object
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only load model files that come from a trusted source.
        with open(file, 'rb') as f:
            return pickle.load(f)

    def predict(self, obs):
        """
        Return the most likely state for every observation in ``obs``.

        :param obs: sequence of observations; each element is passed to
            ``preprocess`` on its own, so elements must be single symbols
            (for example the characters of a string)
        :return: list of states taken from ``state_set``
        """
        obs_seq = np.array([self.preprocess(o) for o in obs])
        _, decoded = self.hmm.decode(obs_seq, algorithm='viterbi')
        return [self.state_set[x] for x in decoded]

    # Methods calculating startprob, transmat and emissionprob

    @staticmethod
    def _normalize(smoothed_counts):
        """Scale smoothed counts so every distribution (last axis) sums to 1."""
        table = np.asarray(smoothed_counts, dtype=float)
        # Add-one smoothing keeps every sum strictly positive, so states that
        # never occur in the training data cannot cause a division by zero.
        return table / table.sum(axis=-1, keepdims=True)

    def _init_state(self):
        """calculate the smoothed, normalised start distribution"""
        # Empty state sequences have no first state and are ignored.
        first_states = Counter(seq[0] for seq in self.state_seq_list if seq)
        return self._normalize([first_states[s] + 1 for s in self.state_set])

    def _trans_state(self):
        """calculate the smoothed, normalised transition matrix"""
        # trans_cnt[start_state][end_state]; a state missing from state_set
        # raises KeyError.
        trans_cnt = {
            start_s: {end_s: 0 for end_s in self.state_set}
            for start_s in self.state_set
        }
        for line in self.state_seq_list:
            for w1, w2 in zip(line, line[1:]):
                trans_cnt[w1][w2] += 1
        return self._normalize([
            [trans_cnt[start_s][end_s] + 1 for end_s in self.state_set]
            for start_s in self.state_set
        ])

    def _emit_state(self):
        """calculate the smoothed, normalised emission matrix"""
        # emit_cnt[state][observation]; unknown keys raise KeyError.
        emit_cnt = {
            state: {word: 0 for word in self.obs_set}
            for state in self.state_set
        }
        for state_seq, obs_seq in zip(self.state_seq_list, self.obs_seq_list):
            for state, obs in zip(state_seq, obs_seq):
                emit_cnt[state][obs] += 1
        return self._normalize([
            [emit_cnt[s][o] + 1 for o in self.obs_set]
            for s in self.state_set
        ])

    def preprocess(self, seq: list):
        """
        Map observations to their indices in ``obs_set``.

        :param seq: iterable of observations
        :return: list of indices; an observation missing from ``obs_set`` is
            mapped to the last index
        """
        return [
            self.obs_set.index(obs) if obs in self.obs_set else len(self.obs_set) - 1
            for obs in seq
        ]