def __init__(self, model_file=None, components=None):
    # Guard against model_file=None before touching the filesystem
    if model_file is not None and os.path.exists(model_file):
        self.model = joblib.load(model_file)
    else:
        alu_file = 'Alu_sequence.pkl'
        if os.path.exists(alu_file):
            locis = joblib.load(alu_file)
        else:
            locis = read_sequence('hg19_Alu.bed', 0)
            locis = random.sample(locis, 100000)
            for l in tqdm(locis):
                l.init_seq()
                l.decode_seq()
            locis = list(filter(lambda l: l.seq is not None, locis))
            joblib.dump(locis, alu_file)
        print('Alu Loaded')
        locis = locis[0:5000]
        model = MultinomialHMM(n_components=components, verbose=True, n_iter=50)
        x = np.concatenate(list(map(attrgetter('seq'), locis)))
        x = np.reshape(x, [x.shape[0], 1])
        lengths = list(map(attrgetter('length'), locis))
        model.fit(x, lengths)
        self.model = model
        if model_file is not None:
            joblib.dump(self.model, model_file)
def main():
    rand_p_matrix = np.random.rand(4, 4)
    rand_b_matrix = np.random.rand(4, 3)

    print("\nGenerating p matrix...............")
    p_matrix = normalization(rand_p_matrix)
    print(p_matrix)

    print("\nGenerating b matrix...............")
    b_matrix = normalization(rand_b_matrix)
    print(b_matrix)

    # Generate 1000 observations
    O, _ = generate_observation(1000, p_matrix, b_matrix)

    # Model selection over the number of hidden states
    aic = []
    bic = []
    likelihood = []
    m = 3  # number of distinct observation symbols

    print("\nTraining the HMM for selection of number of states........")
    for n in range(2, 30):
        observations = LabelEncoder().fit_transform(O)
        model = MultinomialHMM(n_components=n, random_state=200263453)
        # hmmlearn expects one symbol per row, i.e. shape (n_samples, 1)
        model.fit(observations.reshape(-1, 1))
        logL = model.score(observations.reshape(-1, 1))
        p = compute_p(n, m)
        a = AIC(logL, p)
        b = BIC(logL, observations, p)
        likelihood.append(logL)
        aic.append(a)
        bic.append(b)

    plot(aic, 'AIC')
    plot(bic, 'BIC')
    plot(likelihood, 'Log likelihood')
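# The helpers used above (normalization, compute_p, AIC, BIC) are not shown
# in this snippet. Below is a minimal sketch of plausible definitions,
# assuming row-stochastic normalization and the usual AIC/BIC penalties.
# These are assumptions for illustration, not the original implementations.

def normalization(matrix):
    """Scale each row so it sums to 1 (row-stochastic matrix)."""
    return matrix / matrix.sum(axis=1, keepdims=True)

def compute_p(n, m):
    """Free parameters of a discrete HMM with n states and m symbols:
    n*(n-1) transitions + n*(m-1) emissions + (n-1) start probabilities."""
    return n * (n - 1) + n * (m - 1) + (n - 1)

def AIC(logL, p):
    return 2 * p - 2 * logL

def BIC(logL, observations, p):
    return p * np.log(len(observations)) - 2 * logL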
def fit_hmm_learn(X, n_states):
    samples = np.concatenate(X)
    lengths = [len(x) for x in X]

    hmm_learn_model = MultinomialHMM(n_components=n_states)
    hmm_learn_model.fit(samples, lengths)

    # Label data using hmmlearn model
    return hmm_learn_model.predict(samples, lengths)
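# A minimal usage sketch for fit_hmm_learn, assuming each sequence in X is
# already a column vector of integer-encoded symbols (hypothetical data,
# not part of the original snippet):
example_X = [np.array([[0], [1], [1], [2]]),
             np.array([[2], [2], [0]])]
example_states = fit_hmm_learn(example_X, n_states=2)
print(example_states)  # one hidden-state label per observation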
def train_hmm():
    """ HMM for sequence learning. """
    print("Loading training data...")
    train_sequence, num_classes = get_sequence("./train_data/*")

    print("Build HMM...")
    model = MultinomialHMM(n_components=2)

    print("Train HMM...")
    # hmmlearn expects a single concatenated array of shape (n_samples, 1)
    model.fit(np.asarray(train_sequence).reshape(-1, 1))
class BKT:
    """
    Implements the Bayesian Knowledge Tracing model.

    This only implements the Viterbi and EM algorithms. These may be used
    together to implement an Intelligent Tutoring System.
    """

    def __init__(self, observed):
        """
        Initializes the object and sets the internal state.

        Args:
            observed: array-like, shape (n_samples, n_features)
        """
        self.observed = np.array(observed)
        if len(self.observed.shape) == 1:
            self.observed = self.observed.reshape(-1, 1)
        # TODO: Check other parameters to this constructor
        self.model = MultinomialHMM(n_components=2, n_iter=100)

    def fit(self) -> None:
        """
        Fits the model to the observed states.

        Uses the EM algorithm to estimate model parameters.
        """
        self.model.fit(self.observed)

    def get_model_params(self) -> tuple:
        """
        Returns the model parameters.

        This must be run only after calling the `fit` function.

        Returns:
            (pi, A, B): The start probabilities, the transition
            probabilities, and the emission probabilities.
        """
        return np.round_(self.model.startprob_, 2), \
            np.round_(self.model.transmat_, 2), \
            np.round_(self.model.emissionprob_, 2)

    def predict(self, sequence) -> np.array:
        """
        Returns the most likely hidden state sequence corresponding to
        `sequence`.

        Args:
            sequence: List of observable states

        Returns:
            state_sequence: Array
        """
        # hmmlearn expects a column vector of observations
        return self.model.predict(np.asarray(sequence).reshape(-1, 1))
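# A short usage sketch for the BKT class above, with a hypothetical sequence
# of binary correct/incorrect answers (1 = correct); the data is illustrative
# only:
answers = [0, 0, 1, 0, 1, 1, 1, 1]
bkt = BKT(answers)
bkt.fit()
pi, A, B = bkt.get_model_params()
print("start:", pi, "transitions:", A, "emissions:", B)
print("knowledge states:", bkt.predict(answers))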
def test_DiscreteHMM_decode(cases: str) -> None:
    np.random.seed(12346)
    cases = int(cases)
    i = 1
    N_decimal = 4
    while i < cases:
        tol = 1e-3
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        # symbols is the number of unique observation types.
        symbols = np.random.randint(4, 9)
        X = []
        lengths = []
        for _ in range(n_samples):
            # each sequence contains every symbol exactly once
            seq_length = symbols
            this_x = np.random.choice(range(symbols),
                                      size=seq_length,
                                      replace=False)
            X.append(this_x)
            lengths.append(seq_length)

        max_iter = 100
        hmm_gold = MultinomialHMM(n_components=hidden_states,
                                  n_iter=max_iter,
                                  tol=tol)
        X_gold = np.concatenate(X).reshape((-1, 1))
        hmm_gold.fit(X_gold, lengths)

        gold_A = hmm_gold.transmat_
        gold_B = hmm_gold.emissionprob_
        gold_pi = hmm_gold.startprob_
        gold_logprob, gold_state_sequence = hmm_gold.decode(X_gold, lengths)

        hmm_mine = DiscreteHMM(hidden_states=hidden_states,
                               symbols=symbols,
                               A=gold_A,
                               B=gold_B,
                               pi=gold_pi)
        mine_logprob_list = []
        mine_state_sequence = []
        for this_x in X:
            this_mine_logprob, this_mine_state_sequence = hmm_mine.decode(this_x)
            mine_logprob_list.append(this_mine_logprob)
            mine_state_sequence.append(this_mine_state_sequence)
        mine_state_sequence = np.concatenate(mine_state_sequence)
        mine_logprob = sum(mine_logprob_list)

        assert_almost_equal(mine_logprob, gold_logprob, decimal=N_decimal)
        assert_almost_equal(mine_state_sequence, gold_state_sequence,
                            decimal=N_decimal)
        i += 1
    print('Successfully tested the decoding function of the discrete HMM!')
def main():
    rand_p_matrix = np.random.rand(4, 4)
    rand_b_matrix = np.random.rand(4, 3)

    print("\nGenerating p matrix...............")
    p_matrix = normalization(rand_p_matrix)
    print(p_matrix)

    print("\nGenerating b matrix...............")
    b_matrix = normalization(rand_b_matrix)
    print(b_matrix)

    # Generate 1000 observations
    O, Q = generate_observation(1000, p_matrix, b_matrix)

    O_seq = [1, 2, 3, 3, 1, 2, 3, 3, 1, 2, 3, 3]
    pi = (1, 0, 0, 0)
    print("\nThe original observation sequence O: {}".format(O[:12]))
    print("The probability 𝑝(𝑂|𝜆) is {} with O: {}".format(
        forward(O_seq, p_matrix, b_matrix, pi)[-1].sum(), O_seq))

    print("\nThe original sequence Q: {}".format(Q[:12]))
    print("The most probable sequence Q: {} with O: {}".format(
        list(viterbi(O_seq, p_matrix, b_matrix, pi)), O_seq))

    observations = LabelEncoder().fit_transform(O)
    model = MultinomialHMM(n_components=4)
    # hmmlearn expects one symbol per row, i.e. shape (n_samples, 1)
    model.fit(observations.reshape(-1, 1))

    est_pi = model.startprob_
    est_p = model.transmat_
    est_b = model.emissionprob_
    print("\nThe estimated transition matrix P:\n {}".format(est_p))
    print("\nThe estimated event matrix B:\n {}".format(est_b))
    print("\nThe estimated start probability pi:\n {}".format(est_pi))

    _, p = chisquare(p_matrix, est_p, axis=None)
    print("\np-value of transition matrix P: {}".format(p))
    _, p = chisquare(b_matrix, est_b, axis=None)
    print("p-value of event matrix B: {}".format(p))
    _, p = chisquare(pi, est_pi, axis=None)
    print("p-value of start probability pi: {}".format(p))
def run_hmm_model(input_df, n_unique, A_df, Eta,
                  n_iter=10000, tol=1e-2, verbose=False,
                  params='e', init_params=''):
    '''
    Runs the HMM model and returns the predicted results, score and model

    input_df    : The dataframe of keypresses
    n_unique    : Number of unique chars
    A_df        : Dataframe of the transition matrix
    Eta         : Emission matrix
    n_iter      : Max number of iterations for the HMM
    tol         : Stop the HMM if the score does not improve by more than this
    verbose     : Whether or not to print out
    params      : Parameters to tune
    init_params : Parameters to initialize
    '''
    # Proportion of characters starting words in English
    char_counts = get_char_counts()

    # Construct model
    hmm = MultinomialHMM(n_components=n_unique,
                         startprob_prior=np.append(0, char_counts.values),
                         transmat_prior=A_df.values,
                         algorithm='viterbi',
                         random_state=None,
                         n_iter=n_iter,
                         tol=tol,
                         verbose=verbose,
                         params=params,
                         init_params=init_params)

    # Set values
    hmm.emissionprob_ = Eta
    hmm.transmat_ = A_df.values
    hmm.startprob_ = np.append(0, char_counts.values)

    # Feed in the clusters as the expected output
    model_input = input_df['cluster'].values

    # Reshape
    if len(model_input.shape) == 1:
        model_input = model_input.reshape((len(model_input), 1))

    # Fit the model
    hmm = hmm.fit(model_input)

    # Score model
    score, results = hmm.decode(model_input)

    return score, results, hmm
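# A hedged usage sketch for run_hmm_model. get_char_counts() is defined
# elsewhere in the original project; the stub below and all input shapes are
# assumptions for illustration only (27 states for 26 letters plus a
# word-start marker, 3 observed keypress clusters), not the real pipeline.
# Note the design choice in run_hmm_model: with params='e' (the default),
# only the emission probabilities are re-estimated during fitting, while the
# transition and start probabilities stay fixed at the values set before fit.
import pandas as pd

def get_char_counts():
    # stub: uniform word-start proportions over 26 characters (assumption)
    return pd.Series(np.full(26, 1.0 / 26))

demo_df = pd.DataFrame({'cluster': np.random.randint(0, 3, size=200)})
demo_A = pd.DataFrame(np.full((27, 27), 1.0 / 27))  # uniform transitions
demo_Eta = np.full((27, 3), 1.0 / 3)                # uniform emissions
demo_score, demo_states, demo_hmm = run_hmm_model(demo_df, 27, demo_A, demo_Eta)
print(demo_score)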
def predict(self, day_to_predict):
    # Get records of 30 days before day_to_predict
    previous_thirty_days = get_previous_month(self.time_series, day_to_predict)
    binary_crime_sequence = previous_thirty_days['Violent Crime Committed?'].values.tolist()

    # An unsupervised HMM can't account for a string of identical emissions.
    # If we see such a string, just predict the same emission for the following day.
    if binary_crime_sequence == [1] * 30:
        return True
    if binary_crime_sequence == [0] * 30:
        return False

    votes = []
    # Train three HMMs. They are initialized randomly, so we take "votes" from
    # the ensemble. An odd number of voters precludes ties, and three is a
    # decent tradeoff between performance and getting bad results by chance.
    for _ in range(3):
        # Train HMM; hmmlearn expects a column vector of observations
        model = MultinomialHMM(n_components=3, n_iter=10000)
        sequence = np.array(binary_crime_sequence).reshape(-1, 1)
        model.fit(sequence)

        # Determine the most likely state of the last day in the sequence
        last_state_probs = model.predict_proba(sequence)[-1]
        current_state = self.get_most_likely(last_state_probs)

        # Determine the most likely state of the day we're trying to predict
        transition_probs = model.transmat_[current_state]
        next_state = self.get_most_likely(transition_probs)

        # Determine the most likely emission (crime/no crime) from a day in that state
        emissions = model.emissionprob_[next_state]
        vote = self.get_most_likely(emissions)

        # Record this HMM's vote
        votes.append(vote)

    # Votes are 1 for crime, 0 for no crime. Return True if the majority votes for crime.
    return sum(votes) > 1
def test_HMM():
    np.random.seed(12345)
    np.set_printoptions(precision=5, suppress=True)

    P = default_hmm()
    ls, obs = P["latent_states"], P["obs_types"]

    # generate a new sequence
    O = generate_training_data(P, n_steps=30, n_examples=25)

    tol = 1e-5
    n_runs = 5
    best, best_theirs = (-np.inf, []), (-np.inf, [])
    for _ in range(n_runs):
        hmm = MultinomialHMM()
        A_, B_, pi_ = hmm.fit(O, ls, obs, tol=tol, verbose=True)

        theirs = MHMM(
            tol=tol,
            verbose=True,
            n_iter=int(1e9),
            transmat_prior=1,
            startprob_prior=1,
            algorithm="viterbi",
            n_components=len(ls),
        )

        O_flat = O.reshape(1, -1).flatten().reshape(-1, 1)
        theirs = theirs.fit(O_flat, lengths=[O.shape[1]] * O.shape[0])

        hmm2 = MultinomialHMM(A=A_, B=B_, pi=pi_)
        like = np.sum([hmm2.log_likelihood(obs) for obs in O])
        like_theirs = theirs.score(O_flat, lengths=[O.shape[1]] * O.shape[0])

        if like > best[0]:
            best = (like, {"A": A_, "B": B_, "pi": pi_})

        if like_theirs > best_theirs[0]:
            best_theirs = (
                like_theirs,
                {
                    "A": theirs.transmat_,
                    "B": theirs.emissionprob_,
                    "pi": theirs.startprob_,
                },
            )
    print("Final log likelihood of sequence: {:.5f}".format(best[0]))
    print("Final log likelihood of sequence (theirs): {:.5f}".format(best_theirs[0]))
    plot_matrices(P, best, best_theirs)
class HMM_Learner:
    def __init__(self, M):
        self.con = MultinomialHMM(n_components=M)
        self.incon = MultinomialHMM(n_components=M)
        self.daID = {
            'ass': 0, 'bck': 1, 'be.neg': 2, 'be.pos': 3, 'el.ass': 4,
            'el.inf': 5, 'el.sug': 6, 'el.und': 7, 'fra': 8, 'inf': 9,
            'off': 10, 'oth': 11, 'stl': 12, 'sug': 13, 'und': 14
        }
        self.da_choose_n = itertools.combinations([
            'ass', 'bck', 'be.neg', 'be.pos', 'el.ass', 'el.inf', 'el.sug',
            'el.und', 'fra', 'inf', 'off', 'oth', 'stl', 'sug', 'und'
        ], 4)

    def addRandomAllSequence(self, X, lengths):
        # shuffle needs a list, not a dict keys view
        da_keys = list(self.daID.keys())
        random.shuffle(da_keys)
        X1 = [[self.daID[x.lower().strip()]] for x in da_keys]
        X.append(X1)
        lengths.append(len(X1))

    def trainHMMs(self, topics, sequences, labels):
        try:
            self.con = pickle.load(open('HMM_consistent.model', 'rb'))
            self.incon = pickle.load(open('HMM_inconsistet.model', 'rb'))
        except Exception:
            X_con = []
            l_con = []
            X_incon = []
            l_incon = []
            for t in topics:
                if t not in sequences or t not in labels:
                    continue
                if sequences[t]:
                    X1 = [[self.daID[da.lower().strip()]] for da in sequences[t]]
                    if 'weak' in labels[t].lower():
                        X_incon.append(X1)
                        l_incon.append(len(sequences[t]))
                    else:
                        X_con.append(X1)
                        l_con.append(len(sequences[t]))
            # Add a completely random sequence to each training set so every
            # symbol occurs at least once, as the multinomial HMM requires
            self.addRandomAllSequence(X_incon, l_incon)
            self.addRandomAllSequence(X_con, l_con)
            self.con.fit(np.concatenate(X_con), l_con)
            self.incon.fit(np.concatenate(X_incon), l_incon)
            pickle.dump(self.con, open('HMM_consistent.model', 'wb'))
            pickle.dump(self.incon, open('HMM_inconsistet.model', 'wb'))

    def testHMMs(self, topics, sequences):
        prediction = {}
        for t in topics:
            if t not in sequences:
                continue
            if sequences[t]:
                X1 = [[self.daID[da.lower().strip()]] for da in sequences[t]]
                c = self.con.score(np.concatenate([X1]), [len(sequences[t])])
                i = self.incon.score(np.concatenate([X1]), [len(sequences[t])])
                prediction[t] = (c, i)
        return prediction

    def generateLabelSequence(self, sequence):
        MIN_VAL = -10000000
        topics = []
        sequences = {}
        isConsistent = False
        max_score_da_seq = ''
        max_score = MIN_VAL
        for n in range(2, 5):
            da_choose_n = [
                p for p in itertools.product([
                    'ass', 'bck', 'be.neg', 'be.pos', 'el.ass', 'el.inf',
                    'el.sug', 'el.und', 'fra', 'inf', 'off', 'oth', 'stl',
                    'sug', 'und'
                ], repeat=n)
            ]
            for s in da_choose_n:
                temp_sequence = copy.deepcopy(sequence)
                temp_sequence.extend(list(s))
                topics.append(str(s))
                sequences[str(s)] = temp_sequence
            scores = self.testHMMs(topics, sequences)
            for t in topics:
                if scores[t][0] > max_score:
                    max_score = scores[t][0]
                    max_score_da_seq = t
                if scores[t][0] > scores[t][1]:
                    isConsistent = True
                    max_score = scores[t][0]
                    max_score_da_seq = t
            if isConsistent:
                break
        print(max_score, max_score_da_seq)
        return max_score_da_seq, isConsistent
### Preparing the test data as a list of numbers -- task 3
dummy_data = []
for i in s:
    if i == ' ':
        dummy_data.append(26)
    else:
        dummy_data.append(ord(i) - ord('A'))

training_data = np.array(dummy_data)
training_data = training_data.reshape((training_data.shape[0], 1))

### HMM model
hmm_model = MultinomialHMM(n_components=2, n_iter=500, tol=0.01, verbose=False)
hmm_model.fit(training_data)
print(hmm_model.monitor_)

print("Transition probability of this model is \n")
print(hmm_model.transmat_)
print("\n")
print("Emission probability of this model is \n")
print(np.transpose(hmm_model.emissionprob_))
print("\n")

## The seven most probable characters
transition_prob1 = transition_prob
emission_prob1 = np.transpose(hmm_model.emissionprob_)
print("For this trained model, the seven most likely characters are\n")
print("For state 0, the seven most likely characters are\n")
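# The snippet stops before actually ranking the characters. A minimal sketch
# of that step, assuming the 27-symbol encoding above (A-Z plus space); this
# helper is an illustration, not part of the original script:
alphabet = [chr(ord('A') + k) for k in range(26)] + [' ']
for state in range(2):
    # emission_prob1 has shape (n_symbols, n_states) after the transpose above
    top7 = np.argsort(emission_prob1[:, state])[::-1][:7]
    print("State {}: {}".format(state, [alphabet[k] for k in top7]))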
class TOE_HMM_CHARS:
    def __init__(self, N=2, maxIters=200):
        self._N = N
        self._M = COMPONENTS
        self._maxIters = maxIters
        self._syms = []

    def loadBrownSymsSeq(self, T):
        taggedWordsIter = brown.tagged_words()
        retIdx = 0
        symSequence = []
        for wrd, tag in taggedWordsIter:
            if wrd:
                for c in wrd:
                    val = ord(c)
                    symSequence.append(val)
                    retIdx += 1
            if retIdx >= T:
                break
        self._syms = symSequence
        # Append every byte value once so all 256 symbols are observed
        self._syms = np.concatenate((self._syms, np.arange(256))).tolist()
        return symSequence

    def textSeqToSymSeq(self, txtSeqArr):
        symSequence = []
        for wrd in txtSeqArr:
            if wrd:
                for c in wrd:
                    val = ord(c)
                    symSequence.append(val)
        return symSequence

    def initHMM(self):
        self._hmm = MultinomialHMM(n_components=self._N,
                                   n_iter=self._maxIters,
                                   verbose=True,
                                   params='ste',
                                   init_params='ste')
        # n_features (int): number of possible symbols emitted by the model.
        # monitor_ (ConvergenceMonitor): checks the convergence of EM.
        # transmat_ (array, shape (n_components, n_components)): transition probabilities.
        # startprob_ (array, shape (n_components,)): initial state distribution.
        # emissionprob_ (array, shape (n_components, n_features)): symbol emission probabilities.

    def trainHMM(self):
        self._hmm.fit(np.array(self._syms).reshape(-1, 1))

    def testTxt(self, txtSeqArr):
        testSymsArr = self.textSeqToSymSeq(txtSeqArr)
        # score expects a column vector of observations
        return self._hmm.score(np.array(testSymsArr).reshape(-1, 1))

    def testSyms(self, symsArr):
        return self._hmm.score(np.array(symsArr).reshape(-1, 1))

    def trainedLambda(self):
        A = self._hmm.transmat_
        B = self._hmm.emissionprob_
        pi = self._hmm.startprob_
        return (A, B, pi)

    def persistHMM(self, filename):
        import pickle
        # pickle produces bytes, so the file must be opened in binary mode
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def loadHMM(filename):
        import pickle
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def pickRandomSeq(self, length=100):
        return [random.randint(0, 255) for idx in range(length)]

    def pickOrderedSeq(self, length=100):
        symSequence = []
        taggedWordsIter = brown.tagged_words()
        maxIdx = len(taggedWordsIter)
        maxMinIdx = maxIdx - length
        minIdx = random.randint(0, maxMinIdx)
        count = 0
        idx = minIdx
        while True:
            (wrd, tag) = taggedWordsIter[idx]
            if wrd:
                for c in wrd:
                    val = ord(c)
                    symSequence.append(val)
                    count += 1
            idx += 1
            if count >= length:
                break
        return symSequence

    def printHMM(self):
        print("A = %s" % str(self._hmm.transmat_))
        print("B = %s" % str(self._hmm.emissionprob_))
        print("PI = %s" % str(self._hmm.startprob_))
        print("Verify A = %s" % np.sum(self._hmm.transmat_, axis=1))
        print("Verify B = %s" % np.sum(self._hmm.emissionprob_, axis=1))
        print("Verify PI = %s" % np.sum(self._hmm.startprob_, axis=0))

    def histo(self):
        return dict((x, self._syms.count(x)) for x in range(256))
n_states = 2
n_emissions = len(possible_emissions)

# Training data: ragged sequences, kept as a plain list of lists
# (np.array over ragged rows would fail on modern NumPy)
X = [
    random.sample(possible_emissions, len(possible_emissions)),
    [1, 1, 2, 1],
    [6, 5, 5, 4, 7, 7],
]
lengths = [len(row) for row in X]
X = np.atleast_2d(np.concatenate(X))

# Create randomly initialized model
model = MultinomialHMM(n_components=2, n_iter=100)

# Train on data
model.fit(X.T, lengths=lengths)

# Trained parameters
transition_matrix = model.transmat_
emission_matrix = model.emissionprob_
initial_state_probability = model.startprob_

# Observed sequence
observed = np.array([1, 1, 1])

# Get helmet
helmet = 0

# Sequence emissions
helping_emission = (helmet, 1, 0)
backstabbing_emission = (helmet, 0, 1)
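# A minimal follow-up sketch, assuming the model trained above: score the
# observed sequence and decode its most likely hidden states (illustration
# only, not part of the original script):
obs_column = observed.reshape(-1, 1)
log_likelihood = model.score(obs_column)
_, hidden_states = model.decode(obs_column)
print(log_likelihood, hidden_states)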
def train(self, data, labels, tp=None):
    labels = np.array(labels)
    for i in range(self.nb_class):
        print("Class", i)
        ind = np.where(labels == i)
        digit_data = np.array(data)[ind]
        self.fit_encode_class(digit_data, i)
        sks, lengths = self.transform_encode_class(digit_data, i)
        if not tp:
            model = MultinomialHMM(n_components=self.nb_components,
                                   n_iter=self.max_iter,
                                   tol=self.tol,
                                   verbose=True,
                                   params='ste',
                                   init_params='e')
            init = 1. / self.nb_components
            model.startprob_ = np.full(self.nb_components, init)
            model.transmat_ = np.full((self.nb_components, self.nb_components), init)
        else:
            model = MultinomialHMM(n_components=self.nb_components,
                                   n_iter=self.max_iter,
                                   tol=self.tol,
                                   verbose=True,
                                   params='ste')
            # Number of distinct centroids
            num_obs = len(np.unique(np.concatenate(sks)))
            model.emissionprob_ = np.zeros((self.nb_components, num_obs))
            hist = {}
            curr = 0
            # integer division: centroids are split into equal buckets per state
            bucket_len = num_obs // self.nb_components
            for j in range(self.nb_components):
                if j == self.nb_components - 1 and curr + bucket_len < num_obs:
                    offset = num_obs - curr - bucket_len
                    for k in range(curr, curr + bucket_len + offset):
                        if j not in hist:
                            hist[j] = []
                        hist[j].append(k)
                        model.emissionprob_[j, k] = 1
                    curr += bucket_len + offset
                else:
                    for k in range(curr, curr + bucket_len):
                        if j not in hist:
                            hist[j] = []
                        hist[j].append(k)
                        model.emissionprob_[j, k] = 1
                    curr += bucket_len
            model.startprob_ = np.zeros(self.nb_components)
            # always ends by penup
            model.startprob_[-1] = 1
            model.transmat_ = np.zeros((self.nb_components, self.nb_components))
            state_occ_count = np.zeros(self.nb_components)
            for example in digit_data:
                j = 0
                prevobs = 0
                for obs in example:
                    le = self.les[i]
                    val = le.transform(obs)
                    if j == 0:
                        prevobs = val
                        j += 1
                        continue
                    prevobs_state = None
                    obs_state = None
                    for k in range(self.nb_components):
                        if prevobs_state is not None and obs_state is not None:
                            break
                        if prevobs in hist[k]:
                            prevobs_state = k
                        if val in hist[k]:
                            obs_state = k
                    state_occ_count[prevobs_state] += 1
                    model.transmat_[prevobs_state, obs_state] += 1
                    prevobs = val
                    j += 1
            for j in range(self.nb_components):
                for k in range(self.nb_components):
                    model.transmat_[j, k] = model.transmat_[j, k] / state_occ_count[j]
        model.fit(sks, lengths)
        self.models[i] = model
def convert(string):
    # mapping function: map A->0, B->1, C->2, ...
    output = []
    for character in string:
        if character == " ":
            number = 26
        else:
            number = ord(character) - 65
        output.append(number)
    return output

data2 = convert(data)  # Convert the data from a stream of characters to a stream of numbers
DD = np.array(data2)
Data_arr = DD.reshape((DD.shape[0], 1))

model = MultinomialHMM(n_components=2, n_iter=200, tol=0.01, verbose=False)
print("Training started")
model.fit(Data_arr)
print("Training Done")
print("Model = ", model.monitor_)

print("The transition prob of this trained model : ")
print(model.transmat_)

emiso = np.transpose(model.emissionprob_)
print("\nThe emission prob of this trained model : ")
print("        State-0     State-1")
print(emiso)

seven_most_probabe(emiso)  # printing the 7 most likely characters

print("Stationary probabilities : ", model.get_stationary_distribution())
print("So, looking at the emission probabilities, we can say that State 1 is Consonant and State 0 is Vowel")

print("\nTask - 4")
model_nat = MultinomialHMM(n_components=2)
model_nat.transmat_ = trans_prob
    if train_data['return'][i] < 0.0 and analysis_data['v'][i] == 2:
        emission_probability[1][2] += 1
    if train_data['return'][i] < 0.0 and analysis_data['v'][i] == 3:
        emission_probability[1][3] += 1

emission_probability[0] /= sum(1 for e in train_data['return'] if e >= 0.0)
emission_probability[1] /= sum(1 for e in train_data['return'] if e < 0.0)
# print(emission_probability)

hmm = MultinomialHMM(n_components=n_states)
# hmmlearn's fitted parameters carry a trailing underscore; without it these
# assignments would create unused attributes instead of configuring the model
hmm.startprob_ = start_probability
hmm.transmat_ = transition_probability
hmm.emissionprob_ = emission_probability

bob_says = np.array([[0, 2, 1, 1, 2, 0]]).T
hmm = hmm.fit(bob_says)
logprob, alice_hears = hmm.decode(bob_says, algorithm="viterbi")
print("Bob says:", ", ".join(map(lambda x: observations[x], bob_says.ravel())))
print("Alice hears:", ", ".join(map(lambda x: states[x], alice_hears)))

'''
law_data['hmm_states'] = hmm.predict(rets)
panel = Figure_Util.Figure()
panel.draw(law_data, title='close', subplots=['hmm_states'], figsize=(20, 10))
'''
db.disconnect()
def test_DiscreteHMM_fit(cases: str) -> None:
    np.random.seed(12346)
    cases = int(cases)
    i = 1
    N_decimal = 4
    max_iter = 100
    tol = 1e-3
    while i < cases:
        n_samples = np.random.randint(10, 50)
        hidden_states = np.random.randint(3, 6)
        # symbols is the number of unique observation types.
        symbols = np.random.randint(4, 9)
        X = []
        lengths = []
        for _ in range(n_samples):
            # each sequence contains every symbol exactly once
            seq_length = symbols
            this_x = np.random.choice(range(symbols),
                                      size=seq_length,
                                      replace=False)
            X.append(this_x)
            lengths.append(seq_length)

        A = np.full((hidden_states, hidden_states), 1 / hidden_states)
        B = []
        for _ in range(hidden_states):
            this_B = np.random.dirichlet(np.ones(symbols), size=1)[0]
            B.append(this_B)
        B = np.array(B)
        pi = np.ones(hidden_states)
        pi = pi / hidden_states

        hmm_gold = MultinomialHMM(n_components=hidden_states,
                                  startprob_prior=1,
                                  transmat_prior=1,
                                  init_params='',
                                  n_iter=max_iter,
                                  tol=tol)
        hmm_gold.transmat_ = A
        hmm_gold.emissionprob_ = B
        hmm_gold.startprob_ = pi

        X_gold = np.concatenate(X).reshape((-1, 1))
        hmm_gold.fit(X_gold, lengths)

        gold_A = hmm_gold.transmat_
        gold_B = hmm_gold.emissionprob_
        gold_pi = hmm_gold.startprob_

        hmm_mine = DiscreteHMM(hidden_states=hidden_states,
                               symbols=symbols,
                               A=A,
                               B=B,
                               pi=pi,
                               tol=tol,
                               max_iter=max_iter)
        hmm_mine.fit(X)
        mine_A = hmm_mine.A
        mine_B = hmm_mine.B
        mine_pi = hmm_mine.pi

        assert_almost_equal(mine_pi, gold_pi, decimal=N_decimal)
        assert_almost_equal(mine_A, gold_A, decimal=N_decimal)
        assert_almost_equal(mine_B, gold_B, decimal=N_decimal)
        i += 1
    print('Successfully tested the parameter-estimation function of the discrete HMM!')
def computeHMM(dataset, alphabet, num_matchstates=9):
    num_sequences = len(dataset)
    best_score = None
    best_model = None
    alphabet = list(alphabet)
    residue_mapper = {alphabet[j]: j for j in range(0, len(alphabet))}
    # One begin state, one end state, num_matchstates + 1 insert states,
    # num_matchstates match states, num_matchstates deletion states.
    num_states = 3 + 3 * num_matchstates
    concat_dataset = np.concatenate([[[residue_mapper[x]] for x in y] for y in dataset])
    dataset_lengths = [len(x) for x in dataset]
    for x in range(0, 10):
        transition_matrix = np.zeros((num_states, num_states))
        emission_matrix = np.zeros((num_states, len(alphabet)))
        # The first num_matchstates + 2 states are the match states (including
        # begin and end, though those two are silent).
        # First do B, then M_1, ..., M_m.
        # B goes to either I_0 or M_1.
        b_row = ProfileHMM.compute_random_row(2)
        transition_matrix[0][1] = b_row[0]
        transition_matrix[0][2] = b_row[1]
        for i in range(1, num_matchstates + 1):
            # Go to either a match state, an insertion state, or a delete state.
            m_row = ProfileHMM.compute_random_row(3)
            # next match state
            transition_matrix[i][i + 1] = m_row[0]
            # insert state
            transition_matrix[i][i + num_matchstates + 2] = m_row[1]
            # deletion state
            print('i: %d' % i)
            transition_matrix[i][i + 2 * num_matchstates + 2] = m_row[2]
            emission_matrix[i] = ProfileHMM.compute_random_row(len(alphabet))
        # Now the insertion states.
        for i in range(num_matchstates + 2, 2 * num_matchstates + 3):
            # Either go to self, or to the next match state.
            row = ProfileHMM.compute_random_row(2)
            transition_matrix[i][i] = row[0]
            transition_matrix[i][i - (num_matchstates + 1)] = row[1]
            emission_matrix[i] = ProfileHMM.compute_random_row(len(alphabet))
        # Now the deletion states. The loop covers all but the last one.
        for i in range(2 * num_matchstates + 3, 3 * num_matchstates + 2):
            row = ProfileHMM.compute_random_row(2)
            transition_matrix[i][i] = row[0]
            transition_matrix[i][i - 2 * num_matchstates - 1] = row[1]
        model = MultinomialHMM(num_states, params="ets")
        model.n_features = len(alphabet)
        start_prob = np.zeros(num_states)
        start_prob[0] = 1.0
        print('start prob array')
        print(start_prob)
        model.startprob_ = start_prob
        model.transmat_ = transition_matrix
        model.emissionprob_ = emission_matrix
        try:
            model.fit(concat_dataset, dataset_lengths)
        except ValueError:
            pdb.set_trace()
        print('model')
        print(model)
        """
        for row in range(0, len(model.emissionprob_)):
            for col in range(0, len(model.emissionprob_[row])):
                count = model.emissionprob_[row][col] * num_sequences
                model.emissionprob_[row][col] = (count + 0.01) / (num_sequences + len(alphabet) * 0.01)
        """
        print('emission probabilities')
        print(model.emissionprob_)
        score = model.score(concat_dataset, dataset_lengths)
        # keep the best-scoring of the ten random restarts
        if best_score is None or score > best_score:
            best_score = score
            best_model = model
    return best_model
model = MultinomialHMM(n_components=3, random_state=42, params='e', init_params='e')
model.startprob_ = [0.16, 0.04, 0.8]
model.transmat_ = [[0.67, 0.13, 0.2],
                   [0, 0.5, 0.5],
                   [0, 0, 1]]
'''
startprob_: V: D: J:
transmat_: each row gives the probabilities of moving from that row's state
to each column's state (rows sum to 1, columns need not)
'''
model.fit(X, length)
emission = model.emissionprob_
model.n_features
model.transmat_
model.startprob_
model.get_stationary_distribution()

# let's test
train_score = []
for i in cdr3_train_index:
    test = string2matrix_plain(cdr3[i]).astype(np.int)
    score = model.score(test)
    train_score.append(score)
train_score = np.array(train_score)

test_score = []
for i in cdr3_test_index:
def main(params):
    DEBUG = params['DEBUG']
    dataset = params['dataset']
    nh_part = params['nh_part']
    nh_chords = params['nh_chords']
    num_gen = params['num_gen']

    ##################################################################
    # DATA PROCESSING
    # Songs indices
    song_indices = [43, 85, 133, 183, 225, 265, 309, 349, 413, 471, 519,
                    560, 590, 628, 670, 712, 764, 792, 836, 872, 918, 966,
                    1018, 1049, 1091, 1142, 1174, 1222, 1266, 1278, 1304,
                    1340, 1372, 1416, 1456, 1484, 1536, 1576, 1632, 1683,
                    1707, 1752, 1805, 1857, 1891, 1911]

    # Chords mapping
    chord_names = ['C;Em', 'A#;F', 'Dm;Em', 'Dm;G', 'Dm;C', 'Am;Em', 'F;C',
                   'F;G', 'Dm;F', 'C;C', 'C;E', 'Am;G', 'F;Em', 'F;F', 'G;G',
                   'Am;Am', 'Dm;Dm', 'C;A#', 'Em;F', 'C;G', 'G#;A#', 'F;Am',
                   'G#;Fm', 'Am;Gm', 'F;E', 'Dm;Am', 'Em;Em', 'G#;G#',
                   'Em;Am', 'C;Am', 'F;Dm', 'G#;G', 'F;A#', 'Am;G#', 'C;D',
                   'G;Am', 'Am;C', 'Am;A#', 'A#;G', 'Am;F', 'A#;Am', 'E;Am',
                   'Dm;E', 'A;G', 'Am;Dm', 'Em;Dm', 'C;F#m', 'Am;D', 'G#;Em',
                   'C;Dm', 'C;F', 'G;C', 'A#;A#', 'Am;Caug', 'Fm;G', 'A;A']

    # Import .mat file
    dataset_root = os.path.join('data', dataset)
    mat_path = os.path.join(dataset_root, 'data.mat')
    data_mat = sio.loadmat(mat_path)

    chords_per_part = 2
    chords_per_bar = 4
    num_chords = 56
    num_parts = 4
    # integer division: used below as a slice step and a grouping factor
    sub_sampling_ratio_parts = chords_per_bar // chords_per_part

    # Get parts
    parts_data_ = (np.dot(np.transpose(data_mat["feats"][-num_parts:]),
                          np.asarray(range(num_parts))).astype(int)).reshape(-1, 1)
    # Group by bar
    parts_data = parts_data_[::sub_sampling_ratio_parts]
    # Parts with position in bar. Used to condition chord generation
    parts_bar_data = post_processing_parts(parts_data, sub_sampling_ratio_parts)

    # Get chords transitions
    chords_data = (np.dot(np.transpose(data_mat["feats"][:-num_parts]),
                          np.asarray(range(num_chords))).astype(int)).reshape(-1, 1)

    #################################
    # Group by song
    parts_length = []
    chords_length = []
    start_ind = 0
    for end_ind in song_indices:
        chords_length.append(end_ind - start_ind + 1)
        start_ind = end_ind + 1
    parts_length = [e // 2 for e in chords_length]

    ##################################################################
    # PARTS
    # Compute HMM for part modeling
    hmm_part = MultinomialHMM(n_components=nh_part, n_iter=20)
    hmm_part.fit(parts_data, parts_length)
    # def plot_mat(matrix, name):
    #     fig = plt.figure()
    #     ax = fig.add_subplot(1, 1, 1)
    #     ax.set_aspect('equal')
    #     plt.imshow(matrix, interpolation='nearest', cmap=plt.cm.ocean)
    #     plt.colorbar()
    #     plt.savefig(name, format='pdf')
    # plot_mat(hmm_part.transmat_, 'part_transmat.pdf')
    # plot_mat(np.reshape(hmm_part.startprob_, [-1, 1]), 'part_startprob.pdf')
    # plot_mat(hmm_part.emissionprob_, 'part_emissionprob.pdf')

    ##################################################################
    # CHORDS
    hmm_chords = MultinomialHMM_prod(n_components=nh_chords, n_iter=20)
    hmm_chords.fit(chords_data, chords_length)
    # plot_mat(hmm_chords.transmat_, 'chords_transmat.pdf')
    # plot_mat(np.reshape(hmm_chords.startprob_, [-1, 1]), 'chords_startprob.pdf')
    # plot_mat(hmm_chords.emissionprob_, 'chords_emissionprob.pdf')

    #################################
    # GENERATION
    # Sample sequence
    for n in range(num_gen):
        gen_part_sequence_, _ = hmm_part.sample(params["gen_seq_length"])
        gen_part_sequence = post_processing_parts(gen_part_sequence_,
                                                  sub_sampling_ratio_parts)
        # Compute conditioning on parts
        p_chords_given_partBar = build_proba(chords_data, parts_bar_data)
        gen_chord_sequence, _ = hmm_chords.sampling_prod_hmm(
            p_chords_given_partBar, gen_part_sequence)
        ######## T E S T ################
        # Independent HMM ?
        # gen_chord_sequence, _ = hmm_chords.sampling(n_samples=44)
        #################################
        if params["DEBUG"]:
            # text output, so open in text mode rather than 'wb'
            with open("results_chords/" + str(n), 'w') as f:
                for count, (part, chord) in enumerate(
                        zip(gen_part_sequence, gen_chord_sequence)):
                    if count % 2 == 0:
                        f.write(str(part // 2) + " ; " + chord_names[chord[0]] + "\n")
                    else:
                        f.write(" ; " + chord_names[chord[0]] + "\n")
                    if count % 8 == 7:
                        f.write("\n")
    gen_part_sequence = [e // 2 for e in gen_part_sequence]
    return gen_part_sequence, gen_chord_sequence, num_chords, num_parts
def train_syllable_hmm(song_corpus, n_iterations=50):
    hmm = MultinomialHMM(3)
    hmm.transmat_ = np.array([[0, 0, 1],
                              [1, 0, 0],
                              [0, .01, .99]])
    hmm.n_iter = n_iterations
    # hmmlearn expects a single column vector of symbols
    hmm.fit(np.concatenate(song_corpus).reshape(-1, 1))
    return hmm
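# A hedged usage sketch for train_syllable_hmm, with a hypothetical corpus of
# integer-coded syllable sequences (not the original data):
demo_corpus = [np.array([0, 1, 2, 2, 1]), np.array([2, 2, 0, 1])]
syllable_hmm = train_syllable_hmm(demo_corpus)
print(syllable_hmm.score(demo_corpus[0].reshape(-1, 1)))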
discrete_obs, delta_hws, delta_fas = [], [], []
for idx in mice:
    d = _data_on_mouse(data, idx, smoothing_time_radius,
                       smoothing_amplitude_radius, smoothing_tolerance,
                       sampling_interval, bins)
    discrete_obs.append(d[0])
    delta_hws.append(d[1])
    delta_fas.append(d[2])

X = np.array(discrete_obs)
model = MultinomialHMM(n_components=n_components)
predictions = []
# Leave-one-out: train on six mice, decode the held-out mouse
for i in range(7):
    held_out_X = np.vstack((X[:i], X[i + 1:]))
    model.fit(held_out_X)
    predictions.append(model.decode(X[i].reshape(X[i].shape[0], 1)))

f, axarr = plt.subplots(7, 1)
yranges = np.arange(n_components + 1, dtype=float) / n_components
colors = plt.cm.rainbow(np.linspace(0, 1, n_components))
for i in range(7):
    states, indices = _axvspan_maker(predictions[i][1])
    for s, idxs in zip(states, indices):
        axarr[i].axvspan(idxs[0], idxs[1],
                         ymin=yranges[s], ymax=yranges[s + 1],
                         color=colors[s])
plt.show()
target_test = np.hstack([d["label"] for d in dataset[split_pos:]])

# Train scaler
scaler = StandardScaler()
scaler.fit(feature_train)
feature_train = scaler.transform(feature_train)

# Train random forest classifier
clf = RandomForestClassifier()
clf.fit(feature_train, target_train)

# Train HMM
pred_probs = clf.predict_proba(feature_train)[:, 1]
pred_labels = np.array([map_pred(x) for x in pred_probs], dtype=np.int64)
hmm = MultinomialHMM(n_components=2,
                     startprob_prior=np.array([0.5, 0.5]),
                     transmat_prior=np.array([
                         [0.8, 0.2],
                         [0.2, 0.8],
                     ]))
hmm.fit(pred_labels.reshape(-1, 1))

# Evaluation of the entire procedure
predict_results = infer(feature_test, scaler, clf, hmm)
print(classification_report(target_test, predict_results))

# Save models
pickle.dump(scaler, open(path.join(project_dir, "models/scaler.pkl"), "wb"))
pickle.dump(clf, open(path.join(project_dir, "models/clf.pkl"), "wb"))
pickle.dump(hmm, open(path.join(project_dir, "models/hmm.pkl"), "wb"))
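# map_pred and infer are defined elsewhere in this project. A minimal sketch
# of what they plausibly do, under the assumption that map_pred thresholds a
# probability and infer chains scaler -> classifier -> HMM smoothing.
# These are assumptions for illustration, not the original implementations.
def map_pred(prob, threshold=0.5):
    """Map a class-1 probability to a hard 0/1 label."""
    return 1 if prob >= threshold else 0

def infer(features, scaler, clf, hmm):
    """Scale features, classify, then smooth the label sequence with the HMM."""
    probs = clf.predict_proba(scaler.transform(features))[:, 1]
    labels = np.array([map_pred(p) for p in probs], dtype=np.int64)
    return hmm.predict(labels.reshape(-1, 1))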
def train_on_X(X):
    X_train_hmm, X_train_lengths = transform_X_for_hmm(X)
    clf = MultinomialHMM(n_components=n_components, n_iter=n_iter)
    clf.fit(X_train_hmm, lengths=X_train_lengths)
    return clf
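# transform_X_for_hmm is defined elsewhere; below is a plausible sketch,
# assuming X is a list of 1-D integer sequences (an assumption, not the
# original helper):
def transform_X_for_hmm(X):
    """Concatenate sequences into one column vector and record their lengths."""
    lengths = [len(seq) for seq in X]
    stacked = np.concatenate(X).reshape(-1, 1)
    return stacked, lengths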
final_array = []
count_array = []
for x in input_sequences:
    count = 0
    for y in x:
        count += 1
        final_array.append(y)
    count_array.append(count)

data = np.loadtxt('train.csv', delimiter=',')
sample_vector = np.array(final_array)
sequence_lengths = np.array(count_array)

num_components = 3
model = MultinomialHMM(n_components=num_components, n_iter=1000)
# hmmlearn expects a column vector of observations
model.fit(sample_vector.reshape(-1, 1), lengths=sequence_lengths)

# ------------------------------------------------------------------------------------------------------------------------
print("Second Phase")
validating_sequences = []
# for idx in train.index:
count = 0
idx = 0
flag1 = True
flag2 = True
while flag1:
    temp_list = []
    flag2 = True
            high = high + 1
        elif percent >= .50:
            highMid = highMid + 1
        elif percent >= .25:
            lowMid = lowMid + 1
        else:
            low = low + 1
    matrix[1, 0] = low / len(wins)
    matrix[1, 1] = lowMid / len(wins)
    matrix[1, 2] = highMid / len(wins)
    matrix[1, 3] = high / len(wins)
    return matrix

# Load Data
filename = 'data.csv'
X = np.loadtxt(filename, delimiter=',')
player1 = X[:, 0]
player2 = X[:, 1]
record = X[:, 2]

print("stateProbs(record)", stateProbs(record))
print("eProbs(player1, record)", eProbs(player1, record))

clf = MultinomialHMM(n_components=2)
clf.transmat_ = stateProbs(record)
clf.emissionprob_ = eProbs(player1, record)
print("here")
# fit() takes observations (not the parameter matrices), so fit and predict
# on the integer-coded observation sequence as a column vector
clf.fit(player1.astype(int).reshape(-1, 1))
clf.predict(player1.astype(int).reshape(-1, 1))
class TOE_HMM:
    def __init__(self, N=2, maxIters=200):
        self._N = N
        self._M = len(WHITE_LIST)
        self._pi = self.randProbMat(1, N)[0]
        self._A = self.equiProbMat(N, N)
        self._B = self.equiProbMat(N, self._M)
        self._maxIters = maxIters
        self._syms = []

    def randProbMat(self, M, N):
        ret = np.random.rand(M, N)
        ret = ret / ret.sum(axis=1)[:, None]
        return ret

    def equiProbMat(self, M, N):
        ret = np.ones((M, N), dtype=float)
        ret = ret / ret.sum(axis=1)[:, None]
        return ret

    def loadBrownSymsSeq(self, T):
        taggedWordsIter = brown.tagged_words()
        retIdx = 0
        iterIdx = 0
        symSequence = []
        for wrd, tag in taggedWordsIter:
            if retIdx >= T:
                break
            if tag in WHITE_LIST:
                val = WHITE_LIST.index(tag)
                symSequence.append(val)
                retIdx += 1
            iterIdx += 1
        self._syms = symSequence
        return symSequence

    def textSeqToSymSeq(self, txtSeqArr):
        tags = nltk.pos_tag(txtSeqArr)  # PerceptronTagger
        tags = [t[1] for t in tags]
        tags = [WHITE_LIST.index(t) for t in tags if t in WHITE_LIST]
        return tags

    def initHMM(self):
        # self._hmm = MultinomialHMM(n_components=self._N, startprob_prior=None, transmat_prior=None,
        #                            algorithm='viterbi', random_state=None, n_iter=self._maxIters, tol=0.01,
        #                            verbose=True, params='ste', init_params='ste')
        self._hmm = MultinomialHMM(n_components=self._N,
                                   n_iter=self._maxIters,
                                   verbose=True,
                                   params='ste',
                                   init_params='ste')
        # self._hmm.emissionprob_ = self._B
        # n_features (int): number of possible symbols emitted by the model.
        # monitor_ (ConvergenceMonitor): checks the convergence of EM.
        # transmat_ (array, shape (n_components, n_components)): transition probabilities.
        # startprob_ (array, shape (n_components,)): initial state distribution.
        # emissionprob_ (array, shape (n_components, n_features)): symbol emission probabilities.
    def trainHMM(self):
        self._hmm.fit(np.array(self._syms).reshape(-1, 1))

    def testTxt(self, txtSeqArr):
        testSymsArr = self.textSeqToSymSeq(txtSeqArr)
        # score expects a column vector of observations
        return self._hmm.score(np.array(testSymsArr).reshape(-1, 1))

    def testSyms(self, symsArr):
        return self._hmm.score(np.array(symsArr).reshape(-1, 1))

    def persistHMM(self, filename):
        import pickle
        # pickle produces bytes, so the file must be opened in binary mode
        with open(filename, 'wb') as f:
            pickle.dump(self, f)

    @staticmethod
    def loadHMM(filename):
        import pickle
        with open(filename, 'rb') as f:
            return pickle.load(f)

    def pickRandomSeq(self, length=100):
        symSequence = []
        taggedWordsIter = brown.tagged_words()
        maxIdx = len(taggedWordsIter)
        import random
        idx = 0
        while idx < length:
            # randint is inclusive at both ends, so cap at maxIdx - 1
            wrdIdx = random.randint(0, maxIdx - 1)
            (wrd, tag) = taggedWordsIter[wrdIdx]
            if tag in WHITE_LIST:
                val = WHITE_LIST.index(tag)
                symSequence.append(val)
                idx += 1
        return symSequence

    def pickOrderedSeq(self, length=100):
        symSequence = []
        taggedWordsIter = brown.tagged_words()
        maxIdx = len(taggedWordsIter)
        maxMinIdx = maxIdx - length
        import random
        minIdx = random.randint(0, maxMinIdx)
        count = 0
        for idx in range(minIdx, maxIdx):
            (wrd, tag) = taggedWordsIter[idx]
            if tag in WHITE_LIST:
                val = WHITE_LIST.index(tag)
                symSequence.append(val)
                count += 1
            if count >= length:
                break
        return symSequence

    def printHMM(self):
        print("A = %s" % str(self._hmm.transmat_))
        print("B = %s" % str(self._hmm.emissionprob_))
        print("PI = %s" % str(self._hmm.startprob_))
        print("Verify A = %s" % np.sum(self._hmm.transmat_, axis=1))
        print("Verify B = %s" % np.sum(self._hmm.emissionprob_, axis=1))
        print("Verify PI = %s" % np.sum(self._hmm.startprob_, axis=0))

    def histo(self):
        retHisto = dict((x, self._syms.count(x)) for x in range(len(WHITE_LIST)))
        retHisto = dict((WHITE_LIST[k], val) for k, val in retHisto.items())
        return retHisto