def detect_events_hmm(mahal_timeseries, c_timeseries, global_pace_timeseries, threshold_quant=.95):
    """Label time slices as event / non-event using a fixed two-state HMM.

    A time slice emits the outlier symbol 1 when its value in
    mahal_timeseries is above the threshold_quant quantile of all values, or
    when its entry in c_timeseries equals 1; otherwise it emits 0.  The 0/1
    sequence is decoded by a MultinomialHMM whose transition matrix is
    strongly diagonal, so the decoded states change rarely.

    Params:
        mahal_timeseries - mapping from time key to outlier score; its sorted
            keys define the ordering used for every other series
        c_timeseries - mapping over the same keys; a value of 1 forces the
            outlier symbol
        global_pace_timeseries - mapping over the same keys; handed to
            getExpectedPace() and get_all_events()
        threshold_quant - quantile of the scores used as outlier threshold
    Returns:
        events - output of get_all_events(), sorted in descending order of
            element 2 of each event (the duration, per the original author)
        predictions - state sequence returned by model.decode()
    """
    # Chronological order of the time keys
    sorted_dates = sorted(mahal_timeseries)

    # Only the expected pace is used below; the standard deviation is ignored
    expected_pace_timeseries, _sd_pace_timeseries = getExpectedPace(global_pace_timeseries)

    # Flatten every series into a list aligned with sorted_dates
    mahal_list = [mahal_timeseries[date] for date in sorted_dates]
    c_list = [c_timeseries[date] for date in sorted_dates]
    global_pace_list = [global_pace_timeseries[date] for date in sorted_dates]
    expected_pace_list = [expected_pace_timeseries[date] for date in sorted_dates]

    # Outlier threshold taken from the empirical distribution of the scores
    threshold = getQuantile(sorted(mahal_list), threshold_quant)

    # Observation symbols: 1 marks an outlier time slice, 0 a regular one
    symbols = [
        1 if (score > threshold or c_flag == 1) else 0
        for score, c_flag in zip(mahal_list, c_list)
    ]

    # Hidden state 0 models "no event", hidden state 1 models "event".
    # Heavy diagonal weight makes the chain reluctant to switch state, which
    # yields smooth predictions.
    trans_matrix = array([[.999, .001],
                          [.001, .999]])

    # Row = hidden state, column = emitted symbol: the non-event state mostly
    # emits 0, while the event state emits the outlier symbol more often.
    emission_matrix = array([[.95, .05],
                             [.4, .6]])

    model = MultinomialHMM(n_components=2, transmat=trans_matrix)
    model.emissionprob_ = emission_matrix

    # decode() returns (log-likelihood, state sequence); only the states are used
    _lnl, predictions = model.decode(symbols)

    events = get_all_events(predictions, sorted_dates, mahal_list,
                            global_pace_list, expected_pace_list)

    # Longest events first
    events.sort(key=lambda event: event[2], reverse=True)
    return events, predictions
def run_hmm_model(input_df, n_unique, A_df, Eta, n_iter = 10000, 
                        tol=1e-2, verbose = False, params = 'e', init_params = ''):
    '''
        Fit a MultinomialHMM on the 'cluster' column of input_df and decode it.

            input_df : dataframe of keypresses; its 'cluster' column is the
                       observation sequence fed to the model

            n_unique : number of unique characters, used as the number of
                       hidden states

            A_df : dataframe holding the transition matrix; its values are
                   used both as transmat_prior and as the initial transmat_

            Eta : emission matrix assigned to emissionprob_

            n_iter : maximum number of iterations, forwarded to the model

            tol : convergence tolerance, forwarded to the model

            verbose : forwarded to the model

            params : which parameters the fit may update, forwarded to the model

            init_params : which parameters the fit initialises, forwarded to
                          the model

        Returns (score, results, hmm): the two values returned by decode()
        followed by the fitted model.
    '''
    # Start distribution: a leading 0 followed by the values returned by
    # get_char_counts() (proportion of characters starting English words)
    start_probs = np.append(0, get_char_counts().values)
    transitions = A_df.values

    model = MultinomialHMM(
        n_components=n_unique,
        startprob_prior=start_probs,
        transmat_prior=transitions,
        algorithm='viterbi',
        random_state=None,
        n_iter=n_iter,
        tol=tol,
        verbose=verbose,
        params=params,
        init_params=init_params,
    )

    # Seed the model parameters explicitly instead of letting fit() pick them
    model.emissionprob_ = Eta
    model.transmat_ = transitions
    # Separate array object from the prior, as in the original two np.append calls
    model.startprob_ = start_probs.copy()

    # The observed clusters form the sequence; a 1-D vector becomes a column
    observations = input_df['cluster'].values
    if observations.ndim == 1:
        observations = observations.reshape(-1, 1)

    model = model.fit(observations)

    score, results = model.decode(observations)
    return score, results, model
def detect_events_hmm(mahal_timeseries, c_timeseries, global_pace_timeseries,
                      threshold_quant=.95, trans_matrix = DEFAULT_TRANS_MATRIX,
                      emission_matrix=DEFAULT_EMISSION_MATRIX, initial_state=None):
    """Label time slices as event / non-event using a two-state HMM.

    A time slice emits the outlier symbol 1 when its value in
    mahal_timeseries is above the threshold_quant quantile of all values, or
    when its entry in c_timeseries equals 1; otherwise it emits 0.  The 0/1
    sequence is then decoded by a MultinomialHMM built from the supplied
    matrices.

    Params:
        mahal_timeseries - mapping from time key to outlier score; its sorted
            keys define the ordering used for every other series
        c_timeseries - mapping over the same keys; a value of 1 forces the
            outlier symbol
        global_pace_timeseries - mapping over the same keys; handed to
            getExpectedPace() and get_all_events()
        threshold_quant - quantile of the scores used as outlier threshold
        trans_matrix - passed to the model as transmat
        emission_matrix - assigned to the model's emissionprob_
        initial_state - passed to the model as startprob
    Returns:
        events - output of get_all_events(), sorted in descending order of
            element 2 of each event (the duration, per the original author)
        predictions - state sequence returned by model.decode()
    """
    # Chronological order of the time keys
    sorted_dates = sorted(mahal_timeseries)

    # Only the expected pace is used below; the standard deviation is ignored
    expected_pace_timeseries, _sd_pace_timeseries = getExpectedPace(global_pace_timeseries)

    # Flatten every series into a list aligned with sorted_dates
    mahal_list = [mahal_timeseries[date] for date in sorted_dates]
    c_list = [c_timeseries[date] for date in sorted_dates]
    global_pace_list = [global_pace_timeseries[date] for date in sorted_dates]
    expected_pace_list = [expected_pace_timeseries[date] for date in sorted_dates]

    # Outlier threshold taken from the empirical distribution of the scores
    threshold = getQuantile(sorted(mahal_list), threshold_quant)

    # Observation symbols: 1 marks an outlier time slice, 0 a regular one
    symbols = [
        1 if (score > threshold or c_flag == 1) else 0
        for score, c_flag in zip(mahal_list, c_list)
    ]

    model = MultinomialHMM(n_components=2, transmat=trans_matrix, startprob=initial_state)
    model.emissionprob_ = emission_matrix

    # decode() returns (log-likelihood, state sequence); only the states are used
    _lnl, predictions = model.decode(symbols)

    events = get_all_events(predictions, sorted_dates, mahal_list,
                            global_pace_list, expected_pace_list)

    # Longest events first
    events.sort(key=lambda event: event[2], reverse=True)
    return events, predictions
Esempio n. 4
0
def detect_events_hmm(mahal_timeseries,
                      c_timeseries,
                      global_pace_timeseries,
                      threshold_quant=.95,
                      trans_matrix=DEFAULT_TRANS_MATRIX,
                      emission_matrix=DEFAULT_EMISSION_MATRIX,
                      initial_state=None):
    """Detect events by decoding an outlier indicator sequence with an HMM.

    Each time key yields the symbol 1 if its mahal_timeseries value exceeds
    the threshold_quant quantile of all values or its c_timeseries entry
    equals 1, and the symbol 0 otherwise.  A two-component MultinomialHMM
    configured with the given matrices decodes that symbol sequence.

    Params:
        mahal_timeseries - mapping from time key to outlier score; sorting
            its keys gives the order of all derived lists
        c_timeseries - mapping over the same keys; 1 forces the outlier symbol
        global_pace_timeseries - mapping over the same keys; used by
            getExpectedPace() and forwarded to get_all_events()
        threshold_quant - quantile used to derive the outlier threshold
        trans_matrix - forwarded to MultinomialHMM as transmat
        emission_matrix - stored on the model as emissionprob_
        initial_state - forwarded to MultinomialHMM as startprob
    Returns:
        events - list from get_all_events(), ordered by element 2 of each
            event, largest first (the original comment calls this duration)
        predictions - second value returned by model.decode()
    """
    sorted_dates = sorted(mahal_timeseries)

    # The standard-deviation series is returned but not needed here
    expected_pace_timeseries, _unused_sd = getExpectedPace(
        global_pace_timeseries)

    # Per-key values, all aligned with sorted_dates
    mahal_list = [mahal_timeseries[key] for key in sorted_dates]
    c_list = [c_timeseries[key] for key in sorted_dates]
    global_pace_list = [global_pace_timeseries[key] for key in sorted_dates]
    expected_pace_list = [expected_pace_timeseries[key] for key in sorted_dates]

    # Quantile of the sorted scores acts as the outlier cut-off
    threshold = getQuantile(sorted(mahal_list), threshold_quant)

    # 1 = outlier observation, 0 = ordinary observation
    symbols = []
    for score, c_flag in zip(mahal_list, c_list):
        is_outlier = score > threshold or c_flag == 1
        symbols.append(1 if is_outlier else 0)

    model = MultinomialHMM(n_components=2,
                           transmat=trans_matrix,
                           startprob=initial_state)
    model.emissionprob_ = emission_matrix

    # First returned value (the log-likelihood) is not used
    _log_likelihood, predictions = model.decode(symbols)

    events = get_all_events(predictions, sorted_dates, mahal_list,
                            global_pace_list, expected_pace_list)

    # Put the longest events at the front
    events.sort(key=lambda event: event[2], reverse=True)
    return events, predictions
def test_DiscreteHMM_decode(cases: str) -> None:
    """Check DiscreteHMM.decode against MultinomialHMM.decode on random data.

    For every trial a reference MultinomialHMM is fitted on random sequences;
    its transition, emission and start parameters are copied into a
    DiscreteHMM, and both models must agree on the total log-probability and
    on the concatenated decoded state sequence (to 4 decimals).

    :param cases: trial bound, converted with int(); because the trial
        counter starts at 1, ``int(cases) - 1`` trials are executed.
    """
    np.random.seed(12346)
    trial_bound = int(cases)
    decimals = 4
    tolerance = 1e-3

    for _trial in range(1, trial_bound):
        # Keep the order of these draws: they consume the seeded global RNG
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        # number of unique observation types
        symbols = np.random.randint(4, 9)

        # Every sequence is a random permutation of all symbols, so each
        # sequence has exactly `symbols` observations
        X = [
            np.random.choice(range(symbols), size=symbols, replace=False)
            for _sample in range(n_samples)
        ]
        lengths = [symbols] * n_samples

        # Reference model: fit and decode with MultinomialHMM
        hmm_gold = MultinomialHMM(n_components=hidden_states, n_iter=100,
                                  tol=tolerance)
        X_gold = np.concatenate(X).reshape((-1, 1))
        hmm_gold.fit(X_gold, lengths)
        gold_logprob, gold_state_sequence = hmm_gold.decode(X_gold, lengths)

        # Model under test, initialised with the fitted reference parameters
        hmm_mine = DiscreteHMM(hidden_states=hidden_states,
                               symbols=symbols,
                               A=hmm_gold.transmat_,
                               B=hmm_gold.emissionprob_,
                               pi=hmm_gold.startprob_)

        # DiscreteHMM decodes one sequence at a time; accumulate the results
        decoded = [hmm_mine.decode(sequence) for sequence in X]
        mine_logprob = sum([logprob for logprob, _states in decoded])
        mine_state_sequence = np.concatenate(
            [states for _logprob, states in decoded])

        assert_almost_equal(mine_logprob, gold_logprob, decimal=decimals)
        assert_almost_equal(mine_state_sequence, gold_state_sequence,
                            decimal=decimals)
    print('Successfully testing the function of computing decodes in discrete HMM!')
            emission_probability[1][2] += 1
        if train_data['return'][i] < 0.0 and analysis_data['v'][i] == 3:
            emission_probability[1][3] += 1
    emission_probability[0] /= sum(1 for e in train_data['return'] if e >= 0.0)
    emission_probability[1] /= sum(1 for e in train_data['return'] if e < 0.0)
    #print(emission_probability)


    hmm = MultinomialHMM(n_components=n_states)
    hmm.startprob = start_probability
    hmm.transmat = transition_probability
    hmm.emissionprob = emission_probability

    bob_says = np.array([[0, 2, 1, 1, 2, 0]]).T
    hmm = hmm.fit(bob_says)

    logprob, alice_hears = hmm.decode(bob_says, algorithm="viterbi")
    print("Bob says:", ", ".join(map(lambda x: observations[x], bob_says)))
    print("Alice hears:", ", ".join(map(lambda x: states[x], alice_hears)))



    '''
    law_data['hmm_states'] = hmm.predict(rets)
    panel = Figure_Util.Figure()
    panel.draw(law_data, title='close', subplots=['hmm_states'], figsize=(20, 10))
    '''

    db.disconnect()

Esempio n. 7
0
        for sent in test_sents:
            inp = []
            for i in range(len(sent)):
                word = sent[i][0]
                try:
                    k = list(emission_dict.keys()).index(word)
                except:
                    nexcept += 1
                    k = emission_matrix.shape[0] - 1

                inp.append(k)

            inp = np.atleast_2d(inp).T

            if (len(inp) != 1):
                logprob, out_ = model.decode(inp, algorithm='viterbi')
                out_ = list(map(lambda x: states[x], out_))

            else:
                #print(sent[0][-1])
                out_ = []
                out_.append('O')

            for i in range(len(sent)):
                word = sent[i][0]
                gold = sent[i][-1]
                pred = out_[i]
                out.write("{}\t{}\t{}\n".format(word, gold, pred))
            #j+=1
        out.write("\n")
    print(nexcept)
Esempio n. 8
0
# Hand-set the parameters of a three-state model instead of fitting them.
# Per the comment further down, the states are labelled E = 0, 5 = 1, I = 2.

# All start probability mass is on state 0.
model.startprob_ = np.array([1, 0, 0])
# NOTE(review): nothing visible in this snippet reads `endprob_` -- confirm
# that the model class actually uses this attribute.
model.endprob_ = np.array([0, 0, 0.1])

# Row = current state, column = next state: state 0 stays with 0.9 or moves
# to state 1 with 0.1, state 1 always moves to state 2, state 2 is absorbing.
model.transmat_ = np.array([[0.9, 0.1, 0], [0, 0, 1], [0, 0, 1]])
# Row = state, column = symbol, using the A, C, G, T = 0..3 encoding below.
model.emissionprob_ = np.array([[0.25, 0.25, 0.25, 0.25], [0.05, 0, 0.95, 0],
                                [0.4, 0.1, 0.1, 0.4]])

# In[121]:

#"CTTCATGTGAAAGCAGACGTAAGTCA" A = 0 , C = 1 , G = 2 , T = 3
sequence = [
    1, 3, 3, 1, 0, 3, 2, 3, 2, 0, 0, 0, 2, 1, 0, 2, 0, 1, 2, 3, 0, 0, 2, 3, 1,
    0
]

# decode() receives the observations as a column vector (one symbol per row)
# and returns a log-probability together with the decoded state sequence.
logprob, seq = model.decode(np.array([sequence]).transpose())
print(logprob)
print(seq)
# E = 0 ,  5 = 1 , I = 2
print("following sequence correspond to :")
print("EEEEEEEEEEEEEEEEEE5IIIIIII")

# ##  Question 1.2  : HMM
# #### based on the following paper :
# https://editorialexpress.com/cgi-bin/conference/download.cgi?db_name=SILC2016&paper_id=38

# In[90]:


# Calculating Mean Absolute Percentage Error of predictions
def calc_mape(predicted_data, true_data):
Esempio n. 9
0
# Next-symbol prediction check: for every element of every validation
# sequence, try each of 8 candidate symbols as the next observation, keep the
# one whose decoded sequence has the highest log-probability, and count a hit
# when it equals the element that actually follows.
# NOTE(review): `count` is incremented below but never initialised in this
# snippet -- it has to be defined before this code runs.
hits = 0
for seq in validating_sequences:
    # Observations of the current sequence seen so far
    seq_list = []      
    for inst in seq:
        prob_list = []
        len_list = []
        # NOTE(review): pred_list is never used in this snippet
        pred_list = []
        
        # Predict only once at least two observations have been accumulated
        if len(seq_list) > 1:
            for i in range(8):
                # Tentatively extend the history with candidate symbol i
                seq_list.append(i)
                a = np.array(seq_list).reshape(-1,1)
                len_list = []
                len_list.append(len(seq_list))
                b = np.array(len_list)
                # decode() returns a tuple; element 0 is stored as the score
                prob = model.decode(a , lengths = b)
                prob_list.append(prob[0])
                # Remove the candidate again before trying the next one
                seq_list.pop()
            # The index of the best score is the predicted symbol
            max_value = max(prob_list)
            pred_value = prob_list.index(max_value)
            if pred_value == inst[0]:
                hits+=1
            count+=1
            # Extend the history with the true observation
            seq_list.append(inst[0])
            
        else:
            # Not enough history yet: only record the observation.
            # len_list is rebuilt at the next iteration, so this append has
            # no visible effect.
            seq_list.append(inst[0])
            len_list.append(len(seq_list))
        

print(count)
Esempio n. 10
0
    negative_score = []
    negative = []
    for i in range(data.shape[0]):
        if data['vdjdb.score'].iloc[i] == 0:
            negative.append(data['cdr3.alpha'].iloc[i])
    for i in range(len(negative)):
        test = string2matrix_plain(negative[i]).astype(np.int)
        score = model.score(test)
        negative_score.append(score)
    negative_score = np.array(negative_score)

    import matplotlib.pyplot as plt
    fig, ax = plt.subplots()
    ax.boxplot([train_score, test_score, negative_score],
               positions=[1, 2, 3],
               labels=['training', 'testing_pos', 'testing_neg'])
    x_train = np.random.normal(1, 0.02, size=len(train_score))
    ax.plot(x_train, train_score, 'r.', alpha=0.2)
    x_test = np.random.normal(2, 0.02, size=len(test_score))
    ax.plot(x_test, test_score, 'b.', alpha=0.2)
    x_negative = np.random.normal(3, 0.02, size=len(negative_score))
    ax.plot(x_negative, negative_score, 'k.', alpha=0.2)
    ax.set_title('Observation score from HMM -- "GILGFVFTL"--TCRA')

    import scipy.stats as sc
    sc.ttest_ind(test_score, negative_score)

    model.predict(test1)
    model.predict_proba(test1)
    model.decode(test1)
Esempio n. 11
0
    d = _data_on_mouse(data, idx, smoothing_time_radius,
                       smoothing_amplitude_radius, smoothing_tolerance, 
                       sampling_interval, bins)
    discrete_obs.append(d[0])
    delta_hws.append(d[1])
    delta_fas.append(d[2])

# One row per entry of discrete_obs
X = np.array(discrete_obs)


# Leave-one-out evaluation: for each of the 7 rows, fit the model on the
# remaining rows and decode the held-out row.
# NOTE(review): 7 is hard-coded -- presumably the number of rows in X; confirm.
model = MultinomialHMM(n_components = n_components)
predictions = []
for i in range(7):
    # Every row except row i
    held_out_X = np.vstack((X[:i], X[i+1:]))
    model.fit(held_out_X)
    # The held-out row is decoded as a column vector
    predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1)))

# One subplot per held-out row
f, axarr = plt.subplots(7, 1)
# n_components + 1 evenly spaced fractions from 0 to 1: state s is drawn in
# the vertical band between yranges[s] and yranges[s+1]
yranges = np.arange(n_components+1, dtype=float)/n_components
# One colour per state
colors = plt.cm.rainbow(np.linspace(0, 1, n_components))
for i in range(7):
    # predictions[i][1] is the second value returned by decode()
    states, indices = _axvspan_maker(predictions[i][1])
    for s, idxs in zip(states, indices): 
        axarr[i].axvspan(idxs[0], idxs[1], ymin=yranges[s], ymax=yranges[s+1], color=colors[s])
plt.show()

# healthy_model = MultinomialHMM(n_components = n_components)
# healthy_model.fit(dos)
# hs_preds = healthy_model.predict(dos.reshape(len(dos), 1))

Esempio n. 12
0
# Collect the three per-mouse series returned by _data_on_mouse()
for idx in mice:
    d = _data_on_mouse(data, idx, smoothing_time_radius,
                       smoothing_amplitude_radius, smoothing_tolerance,
                       sampling_interval, bins)
    discrete_obs.append(d[0])
    delta_hws.append(d[1])
    delta_fas.append(d[2])

# One row per entry of discrete_obs
X = np.array(discrete_obs)

# Leave-one-out evaluation: for each of the 7 rows, fit the model on the
# remaining rows and decode the held-out row.
# NOTE(review): 7 is hard-coded -- presumably len(mice); confirm.
model = MultinomialHMM(n_components=n_components)
predictions = []
for i in range(7):
    # Every row except row i
    held_out_X = np.vstack((X[:i], X[i + 1:]))
    model.fit(held_out_X)
    # The held-out row is decoded as a column vector
    predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1)))

# One subplot per held-out row
f, axarr = plt.subplots(7, 1)
# n_components + 1 evenly spaced fractions from 0 to 1: state s is drawn in
# the vertical band between yranges[s] and yranges[s + 1]
yranges = np.arange(n_components + 1, dtype=float) / n_components
# One colour per state
colors = plt.cm.rainbow(np.linspace(0, 1, n_components))
for i in range(7):
    # predictions[i][1] is the second value returned by decode()
    states, indices = _axvspan_maker(predictions[i][1])
    for s, idxs in zip(states, indices):
        axarr[i].axvspan(idxs[0],
                         idxs[1],
                         ymin=yranges[s],
                         ymax=yranges[s + 1],
                         color=colors[s])
plt.show()

# healthy_model = MultinomialHMM(n_components = n_components)
Esempio n. 13
0
class HMM:
    """Supervised HMM whose parameters are estimated by counting.

    Start, transition and emission probabilities are computed from labelled
    sequences with add-one smoothing and copied onto a ``MultinomialHMM``,
    which is then used only for decoding.
    """

    def __init__(self):
        pass

    def train(self, obs_seq_list: list, state_seq_list: list, obs_set: list,
              state_set: list, file):
        """
        Estimate the model parameters from labelled sequences.

        :param obs_seq_list: observation sequence list [[o1, o2, o3], [o1, o2, o3]...]
        :param state_seq_list: state sequence list [[s1, s2, s3], [s1, s2, s3]...];
            strings of single-character states are accepted as well
        :param obs_set: all possible observation state; unknown observations
            are mapped to its last entry by :meth:`preprocess`
        :param state_set: all possible state
        :param file: path the trained object is pickled to, or None to skip
            saving
        """
        self.obs_seq_list = obs_seq_list
        self.state_seq_list = state_seq_list
        self.obs_set = obs_set
        self.state_set = state_set
        # Counter.update iterates each sequence, so both strings and lists of
        # state labels are counted (''.join only worked for strings).
        self.counter = Counter()
        for state_seq in state_seq_list:
            self.counter.update(state_seq)

        self.hmm = MultinomialHMM(n_components=len(self.state_set))

        self.startprob, self.transmat, self.emissionprob = \
            self._init_state(), self._trans_state(), self._emit_state()
        self.hmm.startprob_ = self.startprob
        self.hmm.transmat_ = self.transmat
        self.hmm.emissionprob_ = self.emissionprob

        if file is not None:
            with open(file, 'wb') as f:
                pickle.dump(self, f)

    @staticmethod
    def load_model(file: str = None):
        """Unpickle and return a model previously saved by :meth:`train`.

        :param file: path of the pickle file; a path must be supplied, since
            the default of None cannot be opened
        """
        # SECURITY: pickle.load executes arbitrary code embedded in the file;
        # only load model files that come from a trusted source.
        with open(file, 'rb') as f:
            return pickle.load(f)

    def predict(self, obs):
        """Return the Viterbi state label for every element of ``obs``.

        Each element of ``obs`` is passed to :meth:`preprocess`, which
        iterates over it, so every element should itself be a length-1
        sequence (e.g. a single character) to obtain one row per observation.
        """
        obs_seq = np.array([self.preprocess(o) for o in obs])
        _, decoded = self.hmm.decode(obs_seq, algorithm='viterbi')
        return [self.state_set[x] for x in decoded]

    # Methods calculating startprob, transmat and emissionprob

    @staticmethod
    def _add_one_normalize(counts):
        """Add one to every count and scale the last axis to sum to 1."""
        smoothed = np.asarray(counts, dtype=float) + 1.0
        # Dividing by the smoothed total (not the raw total) keeps every
        # row a valid distribution and never divides by zero.
        return smoothed / smoothed.sum(axis=-1, keepdims=True)

    def _init_state(self):
        """Return the add-one smoothed start distribution over state_set."""
        # Empty sequences have no first state and are ignored
        first_states = [s[0] for s in self.state_seq_list if len(s) > 0]
        cnt = Counter(first_states)
        return self._add_one_normalize([cnt[s] for s in self.state_set])

    def _trans_state(self):
        """Return the add-one smoothed transition matrix.

        Entry [i][j] is the probability of moving from state_set[i] to
        state_set[j]; every row sums to 1, including rows of states that
        never occur in the training data (those rows are uniform).
        """
        # trans_cnt[start_state][end_state]
        trans_cnt = {
            start_s: dict.fromkeys(self.state_set, 0.0)
            for start_s in self.state_set
        }
        for line in self.state_seq_list:
            for w1, w2 in zip(line, line[1:]):
                trans_cnt[w1][w2] += 1.0
        counts = [[trans_cnt[start_s][end_s] for end_s in self.state_set]
                  for start_s in self.state_set]
        return self._add_one_normalize(counts)

    def _emit_state(self):
        """Return the add-one smoothed emission matrix.

        Entry [i][j] is the probability that state_set[i] emits obs_set[j];
        every row sums to 1.
        """
        emit_cnt = {
            state: dict.fromkeys(self.obs_set, 0.0)
            for state in self.state_set
        }
        for state_seq, obs_seq in zip(self.state_seq_list, self.obs_seq_list):
            for state, obs in zip(state_seq, obs_seq):
                emit_cnt[state][obs] += 1
        counts = [[emit_cnt[s][o] for o in self.obs_set]
                  for s in self.state_set]
        return self._add_one_normalize(counts)

    def preprocess(self, seq: list):
        """Map each observation in ``seq`` to its index in obs_set.

        Observations missing from obs_set are mapped to the last index, i.e.
        the last entry of obs_set acts as the unknown symbol.
        """
        unknown = len(self.obs_set) - 1
        indices = []
        for obs in seq:
            try:
                indices.append(self.obs_set.index(obs))
            except ValueError:
                indices.append(unknown)
        return indices