def update_hmm(self):
    """Rebuild ``self.hmm`` from the current model parameters.

    Constructs a fresh HiddenMarkovModel whose discrete emission
    distributions, start probabilities and transition matrix come from
    ``self.emissions``, ``self.start_prob`` and ``self.transitions``,
    then bakes it and stores it on ``self.hmm``.
    """
    n_states = self.num_states
    n_emissions = self.num_emissions
    model = HiddenMarkovModel('hmm')

    # One discrete emission distribution per hidden state, over the
    # integer symbols 0..n_emissions-1.
    distributions = [
        DiscreteDistribution(dict(zip(range(n_emissions), self.emissions[k])))
        for k in range(n_states)
    ]
    state_list = [
        State(distributions[k], 's' + str(k).zfill(2))
        for k in range(n_states)
    ]
    model.add_states(state_list)

    for src in range(n_states):
        # Entry probability for this state.
        model.add_transition(model.start, state_list[src], self.start_prob[src])
        # Dense state-to-state transitions from the transition matrix.
        for dst in range(n_states):
            model.add_transition(state_list[src], state_list[dst],
                                 self.transitions[src, dst])

    self.hmm = model
    self.hmm.bake()
def load_segmentation_model(modeldata):
    """Build a HiddenMarkovModel from a list of state descriptions.

    Each entry of *modeldata* is a dict with keys ``name``, ``emission``
    (a list of ``(mu, sigma, weight)`` tuples), ``transition`` (a list of
    ``(next_state_name, probability)`` pairs) and optionally
    ``start_prob`` for an edge from the model start.
    """
    model = HiddenMarkovModel('model')
    name_to_state = {}

    # First pass: create one state per description and wire start edges.
    for spec in modeldata:
        params = spec['emission']
        if len(params) == 1:
            # Single component: a plain normal distribution.
            emission = NormalDistribution(*params[0][:2])
        else:
            # Several components: a weighted Gaussian mixture.
            weights = np.array([w for _, _, w in params])
            components = [NormalDistribution(mu, sigma)
                          for mu, sigma, _ in params]
            emission = GeneralMixtureModel(components, weights=weights)
        state = State(emission, name=spec['name'])
        name_to_state[spec['name']] = state
        model.add_state(state)
        if 'start_prob' in spec:
            model.add_transition(model.start, state, spec['start_prob'])

    # Second pass: state-to-state transitions, now that all states exist.
    for spec in modeldata:
        source = name_to_state[spec['name']]
        for target_name, prob in spec['transition']:
            model.add_transition(source, name_to_state[target_name], prob)

    model.bake()
    return model
def oriHMMParams(self):
    """
    Set initial parameters for the Hidden Markov Model (HMM).

    Attributes
    ----------
    HMMParams : dict
        Has 3 keys: "A", state transition matrix, "B" (emission
        probabilities), specifying parameters (Means, Variances, Weights)
        of the mixture Gaussian distributions for each hidden state, and
        "pi", indicating the hidden state weights. This dict will be
        updated after learning procedure.
    """
    hmm = HiddenMarkovModel()
    # GMM emissions
    # 5 Hidden States:
    # 0--start, 1--downstream, 2--no bias, 3--upstream, 4--end
    numdists = 3  # Three-distribution Gaussian Mixtures
    var = 7.5 / (numdists - 1)
    means = [[], [], [], [], []]
    # Component means per state: state 4 (end) and state 0 (start) are
    # offset by +2.5 / -2.5, states 3 and 1 mirror each other, and
    # state 2 (no bias) is centered around zero.
    for i in range(numdists):
        means[4].append(i * 7.5 / (numdists - 1) + 2.5)
        means[3].append(i * 7.5 / (numdists - 1))
        means[2].append((i - (numdists - 1) / 2) * 7.5 / (numdists - 1))
        means[1].append(-i * 7.5 / (numdists - 1))
        means[0].append(-i * 7.5 / (numdists - 1) - 2.5)
    # One GeneralMixtureModel emission per hidden state, all components
    # sharing the same variance.
    states = []
    for i, m in enumerate(means):
        tmp = []
        for j in m:
            tmp.append(NormalDistribution(j, var))
        mixture = GeneralMixtureModel(tmp)
        states.append(State(mixture, name=str(i)))
    hmm.add_states(*tuple(states))
    # Transmission matrix
    #A = [[0., 1., 0., 0., 0.],
    #     [0., 0.4, 0.3, 0.3, 0.],
    #     [0.05, 0., 0.5, 0.45, 0.],
    #     [0., 0., 0., 0.5, 0.5],
    #     [0.99, 0., 0.01, 0., 0.]]
    hmm.add_transition(states[0], states[1], 1)
    hmm.add_transition(states[1], states[1], 0.4)
    hmm.add_transition(states[1], states[2], 0.3)
    hmm.add_transition(states[1], states[3], 0.3)
    hmm.add_transition(states[2], states[0], 0.05)
    hmm.add_transition(states[2], states[2], 0.5)
    hmm.add_transition(states[2], states[3], 0.45)
    hmm.add_transition(states[3], states[3], 0.5)
    hmm.add_transition(states[3], states[4], 0.5)
    hmm.add_transition(states[4], states[0], 0.99)
    hmm.add_transition(states[4], states[2], 0.01)
    # Initial hidden-state weights ("pi"): edges from the model start.
    pi = [0.05, 0.3, 0.3, 0.3, 0.05]
    for i in range(len(states)):
        hmm.add_transition(hmm.start, states[i], pi[i])
    hmm.bake()
    return hmm
def build_an_hmm_example():
    """Build and exercise a small 3-state HMM over DNA characters.

    Each DiscreteDistribution is one row of the emission matrix: the
    probability of seeing each character while the system is in that
    state.  Transition probabilities live on the graph edges.
    """
    emit_1 = DiscreteDistribution({'A': 0.35, 'C': 0.20, 'G': 0.05, 'T': 0.40})
    emit_2 = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})
    emit_3 = DiscreteDistribution({'A': 0.10, 'C': 0.40, 'G': 0.40, 'T': 0.10})

    s1 = State(emit_1, name="s1")
    s2 = State(emit_2, name="s2")
    s3 = State(emit_3, name="s3")

    model = HiddenMarkovModel('example')
    model.add_states([s1, s2, s3])

    # Left-to-right topology s1 -> s2 -> s3 with self-loops.
    model.add_transition(model.start, s1, 0.90)
    model.add_transition(model.start, s2, 0.10)
    model.add_transition(s1, s1, 0.80)
    model.add_transition(s1, s2, 0.20)
    model.add_transition(s2, s2, 0.90)
    model.add_transition(s2, s3, 0.10)
    model.add_transition(s3, s3, 0.70)
    model.add_transition(s3, model.end, 0.30)
    model.bake()

    # Show the baked state order, draw the graph, run the forward pass.
    for state in model.states:
        print(state.name)
    model.plot()
    print("forward:", model.forward(list('ACG')))
def bake_model(tags_sequence, words_sequence): """ 'tags' are the time-demand labels that generate the emitted demand level. Demand level are represented by 'words' """ # rdemand words = [x for x in chain(*words_sequence)] tag_unigrams = unigram_counts(words) tag_bigrams = bigram_counts(words) # Uniform distribution for starting and ending labels all_labels = list(set(words)) tag_starts = starting_counts(all_labels) tag_ends = ending_counts(all_labels) basic_model = HiddenMarkovModel(name="base-hmm-tagger") # Emission count label_train = tags_sequence rdemand_train = words_sequence emission_count = pair_counts(rdemand_train, label_train) # States with emission probability distributions P(word | tag) states = [] for rdemand, label_dict in emission_count.items(): dist_tag = DiscreteDistribution({ label: cn / tag_unigrams[rdemand] for label, cn in label_dict.items() }) states.append(State(dist_tag, name=rdemand)) basic_model.add_states(states) state_names = [s.name for s in states] state_index = {tag: num for num, tag in enumerate(state_names)} # Start transition total_start = sum(tag_starts.values()) for tag, cn in tag_starts.items(): # sname = state_index[tag] basic_model.add_transition(basic_model.start, states[state_index[tag]], cn / total_start) # End transition total_end = sum(tag_ends.values()) for tag, cn in tag_ends.items(): basic_model.add_transition(states[state_index[tag]], basic_model.end, cn / total_end) # Edges between states for the observed transition frequencies P(tag_i | tag_i-1) for key, value in tag_bigrams.items(): basic_model.add_transition(states[state_index[key[0]]], states[state_index[key[1]]], value / tag_unigrams[key[0]]) # Finalize the model basic_model.bake() return basic_model
def hmmer2pom(hmm):
    """Convert a HMMER3 profile-HMM text (string) into a pomegranate
    HiddenMarkovModel and return it serialized as JSON.

    The HMMER file stores negative log probabilities; each value v is
    converted to a probability with exp(-v).  Match (M), insertion (I)
    and deletion (D) states are created per model position; M0 and
    M(K+1) are mapped to the pomegranate start/end states.
    """
    # set up environment
    from math import exp
    from pomegranate import DiscreteDistribution,HiddenMarkovModel,State
    tags = dict(); header = 0; alphabet = None; hmmlines = list()
    # parse HMMER file
    for line in hmm.splitlines():
        l = line.strip()
        if len(l) == 0 or l[0] == '#':
            continue
        elif header == 0:
            if l.startswith('HMM') and l[3] != 'E':  # beginning of actual HMM
                header = 1; alphabet = l.split()[1:]
            else:
                # Header tag line; repeated tags accumulate into a list.
                parts = l.strip().split()
                if parts[0] in tags:
                    if not isinstance(tags[parts[0]], list):
                        tags[parts[0]] = [tags[parts[0]]]
                    tags[parts[0]].append(' '.join(parts[1:]))
                else:
                    tags[parts[0]] = ' '.join(parts[1:])
        elif header == 1:
            # Skip the transition-label line that follows the HMM header.
            header = 2
        else:
            if l.startswith('COMPO'):
                parts = l.strip().split(); tags[parts[0]] = ' '.join(parts[1:])
            else:
                hmmlines.append(l)
    # create all states (3 lines per model position: match emissions,
    # insert emissions, state transitions)
    model = HiddenMarkovModel(tags['NAME']); tmpstates = list(); K = 0
    i_emit = hmmlines[0].split(); tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I0"))  # insertion state
    for l in range(2,len(hmmlines),3):
        m_emit,i_emit,state_trans = [hmmlines[l+i].split() for i in range(0,3)]; K = int(m_emit[0])
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(m_emit[i+1])) for i in range(len(alphabet))}), name="M%d" % K))  # match state
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I%d" % K))  # insertion state
        tmpstates.append(State(None, name="D%d" % K))  # deletion state (silent)
    assert K != 0, "No match states in profile HMM"
    model.add_states(tmpstates); name2state = {state.name:state for state in tmpstates}; name2state["M0"] = model.start; name2state["M%d"%(K+1)] = model.end
    # create all transitions ('*' in the file means probability zero)
    for l in range(1,len(hmmlines),3):
        k = int(l/3); parts = hmmlines[l].split()
        model.add_transition(name2state["M%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[0])))  # 0: M_k -> M_k+1
        model.add_transition(name2state["M%d"%k], name2state["I%d"%k], exp(-1*float(parts[1])))  # 1: M_k -> I_k
        if parts[2] != '*':  # no D_k+1 in last row
            model.add_transition(name2state["M%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[2])))  # 2: M_k -> D_k+1
        model.add_transition(name2state["I%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[3])))  # 3: I_k -> M_k+1
        model.add_transition(name2state["I%d"%k], name2state["I%d"%k], exp(-1*float(parts[4])))  # 4: I_k -> I_k
        if k != 0:  # no D0 state
            model.add_transition(name2state["D%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[5])))  # 5: D_k -> M_k+1
            if parts[6] != '*':  # no D0 state and no D_k+1 in last row
                model.add_transition(name2state["D%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[6])))  # 6: D_k -> D_k+1
    model.bake()
    return model.to_json()
def train_hmm_tagger(data):
    """Build, bake and sanity-check a basic HMM part-of-speech tagger.

    Relies on count tables defined elsewhere in this module
    (tag_unigrams, tag_starts, tag_ends, tag_bigrams) computed from the
    same training set as *data*.
    """
    # HMM
    # Use the tag unigrams and bigrams calculated above to construct a hidden Markov tagger.
    #
    # - Add one state per tag
    # - The emission distribution at each state should be estimated with the formula: $P(w|t) = \frac{C(t, w)}{C(t)}$
    # - Add an edge from the starting state `basic_model.start` to each tag
    # - The transition probability should be estimated with the formula: $P(t|start) = \frac{C(start, t)}{C(start)}$
    # - Add an edge from each tag to the end state `basic_model.end`
    # - The transition probability should be estimated with the formula: $P(end|t) = \frac{C(t, end)}{C(t)}$
    # - Add an edge between _every_ pair of tags
    # - The transition probability should be estimated with the formula: $P(t_2|t_1) = \frac{C(t_1, t_2)}{C(t_1)}$
    basic_model = HiddenMarkovModel(name="base-hmm-tagger")
    state_dict = {}
    states = []
    # NOTE(review): assumes stream() yields (word, tag) pairs so the
    # zip/[::-1] reorders the arguments to (tags, words) -- confirm.
    emission_counts = pair_counts(*list(zip(
        *data.training_set.stream()))[::-1])
    for tag in emission_counts.keys():
        tag_count = tag_unigrams[tag]
        probs = {}
        for w in emission_counts[tag]:
            # P(w|t) = C(t, w) / C(t)
            probs[w] = emission_counts[tag][w] / tag_count
        emission_p = DiscreteDistribution(probs)
        state = State(emission_p, name="" + tag)
        basic_model.add_state(state)
        state_dict[tag] = state
    # Start and end edges for every tag.
    for tag in tag_starts:
        basic_model.add_transition(basic_model.start, state_dict[tag],
                                   tag_starts[tag] / len(data.training_set.Y))
        basic_model.add_transition(state_dict[tag], basic_model.end,
                                   tag_ends[tag] / tag_unigrams[tag])
    # Tag-to-tag edges: P(t2|t1) = C(t1, t2) / C(t1)
    for (tag1, tag2) in tag_bigrams:
        basic_model.add_transition(
            state_dict[tag1], state_dict[tag2],
            tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])
    # finalize the model
    basic_model.bake()
    assert all(
        tag in set(s.name for s in basic_model.states)
        for tag in data.training_set.tagset
    ), "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
    assert basic_model.edge_count() == 168, (
        "Your network should have an edge from the start node to each state, one edge between every " +
        "pair of tags (states), and an edge from each state to the end node.")
    HTML(
        '<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>'
    )
    return basic_model
def insert_delete_main_hmm(data_matrix):
    """Assemble a profile-style HMM from an alignment data matrix.

    Classifies the matrix columns into zones, groups them into states,
    wires silent 'ali_start'/'ali_end' anchor states around them, applies
    the computed transitions, and returns the baked model.
    """
    columns = column_clasify(data_matrix)
    zones = create_zones(columns)
    grouped = group_states(zones, 'test')

    model = HiddenMarkovModel()

    # Silent anchor states that bracket the alignment.
    entry = State(None, name='ali_start')
    exit_ = State(None, name='ali_end')

    model.add_state(entry)
    model.add_transition(model.start, entry, 1)
    model.add_state(exit_)

    add_states(model, grouped)
    transitions = calculate_transitions(entry, exit_, grouped)
    apply_transitions(model, transitions)

    model.bake()
    return model
def _initialize_new_hmm(hmm, new_states, new_transitions):
    """Build a fresh HiddenMarkovModel from explicit states/transitions.

    States equal to the old model's start/end are not copied; any
    transition touching the old start (resp. end) is re-attached to the
    new model's own start (resp. end) marker.
    """
    result = HiddenMarkovModel()

    # Copy every real state; the old start/end markers are replaced by
    # the new model's own.
    for state in new_states:
        if state not in (hmm.start, hmm.end):
            result.add_state(state)

    for source, target, prob in new_transitions:
        if source != hmm.start and target != hmm.end:
            # Purely internal edge: copy unchanged.
            result.add_transition(source, target, prob)
        elif source == hmm.start:
            result.add_transition(result.start, target, prob)
        elif target == hmm.end:
            result.add_transition(source, result.end, prob)

    result.bake()
    return result
def init_model(start_dip, stay_state, mean_eu, sd_eu, mean_loh):
    """Build a 3-state (EU / LOH / ANEU) Gaussian HMM for copy-state calling.

    Parameters
    ----------
    start_dip : probability of starting in the euploid (EU) state.
    stay_state : self-transition probability shared by all states.
    mean_eu, sd_eu : mean / sd of the euploid emission distribution.
    mean_loh : mean of the LOH emission distribution.
    """
    ## define distributions
    d_eu = NormalDistribution(mean_eu, sd_eu)  ## euploid enriched at 0
    d_loh = NormalDistribution(mean_loh, sd_eu)  ## loss of heterozygosity enriched at 1
    d_aneu = NormalDistribution(mean_loh / 2.0, sd_eu * 1.4)  ## aneuploid enriched at 1
    ## define states
    s_eu = State(d_eu, name='EU')  ## enriched at 0
    s_loh = State(d_loh, name='LOH')  ## enriched at 1
    s_aneu = State(d_aneu, name='ANEU')  ## enriched at 1
    ## define model and pass in states
    model = HiddenMarkovModel()
    model.add_states(s_eu, s_loh, s_aneu)
    ## define transition matrix (state a, state b, probability)
    ## NOTE(review): each state's outgoing probabilities sum to more than 1
    ## as written; presumably bake() renormalizes per-state -- confirm.
    model.add_transition(model.start, s_eu, start_dip)
    model.add_transition(model.start, s_loh, 1.0 - start_dip - 0.1)
    model.add_transition(model.start, s_aneu, 0.1)
    model.add_transition(s_eu, s_eu, stay_state)
    model.add_transition(s_eu, s_loh, 1.0 - 4 * stay_state / 5 - 0.001)
    model.add_transition(s_eu, s_aneu, 1.0 - stay_state / 5 - 0.001)
    model.add_transition(s_eu, model.end, 0.002)
    model.add_transition(s_loh, s_loh, stay_state)
    model.add_transition(s_loh, s_eu, 1.0 - 4 * stay_state / 5 - 0.001)
    model.add_transition(s_loh, s_aneu, 1.0 - stay_state / 5 - 0.001)
    model.add_transition(s_loh, model.end, 0.002)
    model.add_transition(s_aneu, s_aneu, stay_state)
    model.add_transition(s_aneu, s_eu, 1.0 - stay_state / 2 - 0.001)
    model.add_transition(s_aneu, s_loh, 1.0 - stay_state / 2 - 0.001)
    model.add_transition(s_aneu, model.end, 0.002)
    ## finalize internal structure
    model.bake()
    ## only train transitions, not emissions
    model.freeze_distributions()
    return model
def ghmm_model(states_labels: tuple, transitions: tuple, init_prob: tuple,
               end_prob: tuple, means: list, vars: list) -> HiddenMarkovModel:
    """Build a Gaussian-mixture HMM from explicit parameter tables.

    :param states_labels: one label per hidden state (used for count only).
    :param transitions: square matrix of state-to-state probabilities.
    :param init_prob: start probability per state (0 entries are skipped).
    :param end_prob: end probability per state (0 entries are skipped).
    :param means: per-state list of component means.
    :param vars: per-state list of component variances.
    :return: the baked HiddenMarkovModel.
    """
    model = HiddenMarkovModel()
    n_components = len(vars[0])

    # One GeneralMixtureModel emission per hidden state, named by index.
    all_states = []
    for idx in range(len(states_labels)):
        components = [
            NormalDistribution(means[idx][c], vars[idx][c])
            for c in range(n_components)
        ]
        all_states.append(State(GeneralMixtureModel(components), name=str(idx)))
    model.add_states(*all_states)

    # Dense transitions; zero entries are omitted from the graph.
    for src, row in enumerate(transitions):
        for dst in range(len(states_labels)):
            if row[dst] != 0.:
                model.add_transition(all_states[src], all_states[dst], row[dst])

    for idx, prob in enumerate(init_prob):
        if prob != 0.:
            model.add_transition(model.start, all_states[idx], prob)
    for idx, prob in enumerate(end_prob):
        if prob != 0.:
            model.add_transition(all_states[idx], model.end, prob)

    model.bake()
    return model
def build_the_same_model_in_test_sample_from_site_line_by_line():
    """Rebuild the three-state Gaussian HMM from the library's example.

    States hold only the emission distributions; the transition
    probabilities are stored on the graph edges added below.
    """
    state_a = State(NormalDistribution(5, 1))
    state_b = State(NormalDistribution(1, 7))
    state_c = State(NormalDistribution(8, 2))

    model = HiddenMarkovModel()
    model.add_states(state_a, state_b, state_c)

    # Left-to-right chain a -> b -> c with self-loops, entering at a.
    edges = [
        (model.start, state_a, 1.0),
        (state_a, state_a, 0.7),
        (state_a, state_b, 0.3),
        (state_b, state_b, 0.8),
        (state_b, state_c, 0.2),
        (state_c, state_c, 0.9),
        (state_c, model.end, 0.1),
    ]
    for source, target, prob in edges:
        model.add_transition(source, target, prob)

    model.bake()
    model.plot()
def buildHmm(minAmpliconLength, maxGap, windowSize):
    """Build a two-state (background / amplicon) HMM over binary windows.

    The switching probability is derived from the minimum amplicon
    length, and the amplicon emission probabilities from the maximum
    allowed gap, both scaled by the window size.
    """
    b_bkgd_1 = 0.1
    # Probability of switching between the two states.
    a_interstate = b_bkgd_1 ** (2 * minAmpliconLength / windowSize)
    stay = 1 - a_interstate

    # Amplicon emissions are tied to the gap tolerance.
    b_amp_0 = a_interstate ** (0.5 * windowSize / maxGap)
    amp_dist = DiscreteDistribution({0: b_amp_0, 1: 1 - b_amp_0})
    bkgd_dist = DiscreteDistribution({0: 1 - b_bkgd_1, 1: b_bkgd_1})

    background = State(bkgd_dist, name='background')
    amplicon = State(amp_dist, name='amplicon')

    hmm = HiddenMarkovModel()
    hmm.add_states(background, amplicon)

    # Symmetric topology: stay with probability `stay`, switch with
    # probability `a_interstate`; the start edge mirrors the same split.
    hmm.add_transition(hmm.start, background, stay)
    hmm.add_transition(hmm.start, amplicon, a_interstate)
    hmm.add_transition(background, background, stay)
    hmm.add_transition(background, amplicon, a_interstate)
    hmm.add_transition(amplicon, background, a_interstate)
    hmm.add_transition(amplicon, amplicon, stay)

    hmm.bake()
    return hmm
def create_hidden_MarkovModel(e_df, q_df, start_p_dict):
    """Create a HiddenMarkovModel from emission / transition DataFrames.

    @args:
        - e_df (pd.DataFrame): emission probabilities; each column maps an
          observation to P(observation | state) for that state.
        - q_df (pd.DataFrame): transition probabilities between states
          (rows = origin state, columns = destination state).
        - start_p_dict (dict): start probability for each state.

    @returns:
        HiddenMarkovModel: the baked model.
    """
    model = HiddenMarkovModel(name="Example Model")

    # 1: create one State per column of the transition DataFrame.
    # (The original code built an unused duplicate State per key and a
    # throwaway `sunny_state` variable; both are removed.)
    model_dict = {}
    for key in q_df.keys().values:
        # 1.1: emission distribution for this state, P(observation | state).
        emission_p = DiscreteDistribution(e_df[key].to_dict())
        model_dict[key] = State(emission_p, name=key)
        model.add_state(model_dict[key])
        # 1.2: start probability for this state.
        model.add_transition(model.start, model_dict[key], start_p_dict[key])

    # 2: add the transition probability between every pair of states.
    # The row dict already carries the probability, so re-reading it via
    # q_df.loc (as the original did) is redundant.
    for origin, row in q_df.to_dict("index").items():
        for destination, prob in row.items():
            model.add_transition(model_dict[origin], model_dict[destination],
                                 prob)

    # finally, call the .bake() method to finalize the model
    model.bake()
    return model
def cluster(self):
    """Cluster the preprocessed data into ``self.K`` discrete states.

    Depending on ``self.alg`` ("Kmeans" or "HMM"), fits the chosen model
    to ``self.preprocessed_data`` (a collection of per-date arrays),
    stores the per-date state sequences in ``self.states``, and advances
    ``self.experiment_progress``.  Returns -1 on error.
    """
    if self.preprocessed_data is None:
        print("No preprocessed_data attribute found")
        return -1
    if self.alg == "Kmeans":
        from sklearn.cluster import KMeans
        km = KMeans(n_clusters=self.K, precompute_distances=True)
        km.fit(np.concatenate(
            self.preprocessed_data))  # flattens all dates together
        # Assign each date's samples to the fitted cluster centers.
        self.states = [km.predict(d) for d in self.preprocessed_data]
    elif self.alg == "HMM":
        from pomegranate import HiddenMarkovModel, MultivariateGaussianDistribution
        distribution = MultivariateGaussianDistribution
        # Fit an HMM with self.K components directly from the samples.
        hmm = HiddenMarkovModel().from_samples(
            distribution, n_components=self.K,
            X=self.preprocessed_data.copy())
        self.states = [
            np.array(hmm.predict(d.copy())) for d in self.preprocessed_data
        ]
    else:
        print("Unrecognised or undefined clustering algorithm.")
        return -1
    self.experiment_progress = 2
# Evaluate the most-frequent-class baseline on train and test splits.
mfc_training_acc = accuracy(data.training_set.X, data.training_set.Y, mfc_model)
print("training accuracy mfc_model: {:.2f}%".format(100 * mfc_training_acc))

mfc_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, mfc_model)
print("testing accuracy mfc_model: {:.2f}%".format(100 * mfc_testing_acc))

# Calculate unigram_counts with a list of tag sequences from the training set
tag_unigrams = unigram_counts(data.training_set.Y)

# Calculate bigram_counts with a list of tag sequences from the training set
tag_bigrams = bigram_counts(data.training_set.Y)

# Calculate the count of each tag starting a sequence
tag_starts = starting_counts(data.training_set.Y)

# Calculate the count of each tag ending a sequence
tag_ends = ending_counts(data.training_set.Y)

basic_model = HiddenMarkovModel(name="base-hmm-tagger")

# Create states with emission probability distributions P(word | tag) and add to the model
tag_states = {}
for tag in data.training_set.tagset:
    tag_emissions = DiscreteDistribution({
        word: emission_counts[tag][word] / tag_unigrams[tag]
        for word in emission_counts[tag]
    })
    tag_states[tag] = State(tag_emissions, name=tag)
    basic_model.add_state(tag_states[tag])

# Add edges between states for the observed transition frequencies P(tag_i | tag_i-1)
# NOTE(review): this chunk is truncated in the source -- the call below is
# missing its probability argument and closing parenthesis.
for tag in data.training_set.tagset:
    basic_model.add_transition(basic_model.start, tag_states[tag],
# Ensure every tag has an end count entry (possibly zero).
if len(end_tag_counts) < len(data.training_set.tagset):
    for tag in data.training_set.tagset:
        if tag not in end_tag_counts:
            end_tag_counts[tag] = 0

################## 5. COUNT NUMBER OF (TAG_i, WORD_i) PAIRS ###################
####################### pair_counts[tag][word] = k ##########################
pair_counts = defaultdict(lambda: defaultdict(lambda: 0))
for sentence_idx, sentence in enumerate(data.training_set.Y):
    for word_idx, tag in enumerate(sentence):
        # X holds the words aligned with the tag sequences in Y.
        word = data.training_set.X[sentence_idx][word_idx]
        pair_counts[tag][word] += 1

############################# 6. BUILD HMM MODEL ##############################
HMM_model = HiddenMarkovModel(name="HMM-Tagger")
tag_states = []  # state for each tag

################# (6.1) ADD STATES w/ EMISSION PROBABILITIES ##################
'''
tag_emissions: P(word_i|tag_j) = P(word_i, tag_j)/P(tag_j)
                               = C((word_i, tag_j) pairs)/C(tag_j)
'''
for tag in data.training_set.tagset:
    tag_emissions = DiscreteDistribution(
        {word: pair_counts[tag][word] / single_tag_counts[tag]
         for word in data.training_set.vocab})
    tag_state = State(tag_emissions, name=tag)
    tag_states.append(tag_state)
    HMM_model.add_states(tag_state)
# Convert each training sequence to order-2 symbols.
converted_total = [converter_to(x, 2) for x in total]

matrixDonor0 = numpy.array(matrix_from_exa('new_donor1.exa'))

# Per-phase coding-state emission probabilities from the cuts file.
c0, c1, c2 = calculator.calculate_proba2('cuts.txt')

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

donor0_data = classify(matrixDonor0, 2)
donor0_states = sequence_state_factory(donor0_data, 'donor0')

post = State(DiscreteDistribution(equal_distribution), name='post')

# NOTE(review): 'codiing to donor' looks like a typo for 'coding to donor'
# (cf. the sibling script); left unchanged because it is a runtime name.
model = HiddenMarkovModel('codiing to donor')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)
add_sequence(model, donor0_states)
model.add_state(post)

# Cycle through the coding phases; the donor chain is entered with 0.4.
model.add_transition(model.start, coding_state0, 1)
model.add_transition(coding_state0, coding_state1, 0.6)
model.add_transition(coding_state0, donor0_states[0], 0.4)
model.add_transition(coding_state1, coding_state2, 0.6)
# Motif alignment matrices for the promoter elements.
matrix_GC = numpy.array(matrix_from_fasta('gc_completo.seq'))
matrix_CCAAT = numpy.array(matrix_from_fasta('CCAAT_completa.seq'))
matrix_Inr = numpy.array(matrix_from_fasta('Inr_completo.seq'))
matrix_no_inr = numpy.array(matrix_from_fasta('no_inr.fa'))

# Order-2 classification of each motif matrix.
gc_data = classify(matrix_GC, 2)
# NOTE(review): matrix_TATA is defined elsewhere in this file.
tata_data = classify(matrix_TATA, 2)
cat_data = classify(matrix_CCAAT, 2)
inr_data = classify(matrix_Inr, 2)
no_inr_data = classify(matrix_no_inr, 2)

# Background (non-coding) emission probabilities.
no_coding = calculator.intron_calculator('cuts_intron.txt')

# Model
promoter_utr_model = HiddenMarkovModel('promoter')

# States
back = State(DiscreteDistribution(no_coding.p), name='back')

gc_states = sequence_state_factory(gc_data, 'GC')

# Spacer chains between the GC box and the TSS / TATA box
# (fixed-length and variable-length variants).
post_gc_var_spacers_tss = spacer_states_maker(151, no_coding.p,
                                              'post gc var spacer tss')
post_gc_spacers_tss = spacer_states_maker(38, no_coding.p,
                                          'post gc spacer tss')
post_gc_var_spacers_tata = spacer_states_maker(151, no_coding.p,
                                               'post gc var spacer tata')
post_gc_spacers_tata = spacer_states_maker(18, no_coding.p,
                                           'post gc spacer tata')

cat_states = sequence_state_factory(cat_data, 'CAT')

# Spacer chains between the CCAAT box and the TSS.
post_cat_var_spacers_tss = spacer_states_maker(151, no_coding.p,
                                               'post cat var spacer tss')
post_cat_spacers_tss = spacer_states_maker(42, no_coding.p,
                                           'post cat spacer tss')
import numpy
from pomegranate import State
from pomegranate import DiscreteDistribution
from pomegranate import HiddenMarkovModel

import calculator
from converter_to import converter_to
from model_maker_utils import sequence_state_factory
from model_maker_utils import classify
from model_maker_utils import add_sequence
from model_maker_utils import equal_distribution
from matrix_from_aln import matrix_from_exa

# Acceptor-site profile from the alignment file (order-2 classification).
matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

# Intron background state; emissions estimated from intron cut sites.
intron = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p), name='in')
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
post = State(DiscreteDistribution(equal_distribution), name='post')

model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

# Topology: stay in the intron (0.9) or enter the acceptor chain (0.1);
# the acceptor chain ends deterministically in 'post'.
model.add_transition(model.start, intron, 1)
model.add_transition(intron, intron, 0.9)
model.add_transition(intron, acceptor0_states[0], 0.1)
model.add_transition(acceptor0_states[-1], post, 1)
# Donor- and acceptor-site state chains, one per reading-frame phase.
donor1_data = classify(matrixDonor1, 2)
donor1_states = sequence_state_factory(donor1_data, 'donor1')

donor2_data = classify(matrixDonor2, 2)
donor2_states = sequence_state_factory(donor2_data, 'donor2')

acceptor0_data = classify(matrixAcceptor0, 2)
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')

acceptor1_data = classify(matrixAcceptor1, 2)
acceptor1_states = sequence_state_factory(acceptor1_data, 'acceptor1')

acceptor2_data = classify(matrixAcceptor2, 2)
acceptor2_states = sequence_state_factory(acceptor2_data, 'acceptor2')

coding_model = HiddenMarkovModel()

# Intron background emissions shared by the intron and spacer states.
intron_distribution = calculator.intron_calculator('cuts_intron.txt')

# NOTE(review): 'back' re-runs intron_calculator instead of reusing
# intron_distribution, and 'fake_back' is displayed as 'back2' -- confirm
# both are intentional.
back = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p), name='back')
fake_back = State(DiscreteDistribution(intron_distribution.p), name='back2')

# One intron state per reading-frame phase.
in0 = State(DiscreteDistribution(intron_distribution.p), name='in0')
in1 = State(DiscreteDistribution(intron_distribution.p), name='in1')
in2 = State(DiscreteDistribution(intron_distribution.p), name='in2')

# 64-state spacer chains, one per phase.
in0_spacers = spacer_states_maker(64, intron_distribution.p, 'in0 spacer')
in1_spacers = spacer_states_maker(64, intron_distribution.p, 'in1 spacer')
in2_spacers = spacer_states_maker(64, intron_distribution.p, 'in2 spacer')
import pomegranate

# import python modules -- this cell needs to be run again if you make changes to any of the files
import matplotlib.pyplot as plt
import numpy as np

from helpers import show_model
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution

# create the HMM model
# (the original cell constructed this twice with identical arguments;
# the redundant second construction has been removed)
model = HiddenMarkovModel(name="Example Model")

# emission probability distributions, P(umbrella | weather)
sunny_emissions = DiscreteDistribution({"yes": 0.1, "no": 0.9})
sunny_state = State(sunny_emissions, name="Sunny")

# create a discrete distribution for the rainy emissions from the probability
# table above & use that distribution to create a state named Rainy
rainy_emissions = DiscreteDistribution({"yes": 0.8, "no": 0.2})
rainy_state = State(rainy_emissions, name="Rainy")

# add the states to the model
def __init__(self):
    # Start from an empty HMM; states and transitions are added elsewhere.
    self.model = HiddenMarkovModel()
def crop_type_hmm_model(nn_pobability_matrix, timeseries_steps,
                        n_observed_classes):
    """Build a crop-type HMM whose emissions come from NN class probabilities.

    Hidden states, in the same index order as the network's output classes:
        0 unknown_plant, 1 large_grass, 2 small_grass,
        3 other, 4 fallow, 5 no_crop

    Args:
        nn_pobability_matrix: per-timestep class probabilities produced by
            the neural network.
        timeseries_steps: number of samples in the time series.
        n_observed_classes: number of observed (network output) classes.

    Returns:
        The baked HiddenMarkovModel.
    """
    state_names = ['unknown_plant', 'large_grass', 'small_grass',
                   'other', 'fallow', 'no_crop']

    # One emission wrapper + State per hidden class (replaces six
    # copy-pasted d0..d5 / s0..s5 blocks with a single loop).
    states = [
        State(
            NeuralNetworkWrapperCustom(
                predicted_probabilities=nn_pobability_matrix,
                i=i,
                n_samples=timeseries_steps,
                n_classes=n_observed_classes),
            name=name)
        for i, name in enumerate(state_names)
    ]

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    for s in states:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    # Transition weights, one row per source state: crop classes stay in
    # their class (95) or go to no_crop (5); no_crop can re-enter any
    # class.  pomegranate normalizes each row at bake().
    transition_weights = [
        [95., 0., 0., 0., 0., 5.],
        [0., 95., 0., 0., 0., 5.],
        [0., 0., 95., 0., 0., 5.],
        [0., 0., 0., 95., 0., 5.],
        [0., 0., 0., 0., 95., 5.],
        [2., 2., 2., 2., 2., 90.],
    ]
    for source, weights in zip(states, transition_weights):
        model.add_transitions(source, states, weights)

    model.bake(verbose=False)
    return model
def train_and_test():
    """Build a coding-to-donor HMM, fit it on extracted exon starts, save it.

    Reads training sequences and probability tables from the
    '../data extractors' directory, evaluates the model before and after
    fitting, and writes the fitted model to JSON.
    """
    # Read exon-start training sequences; drop 'P' markers and newlines.
    with open('../data extractors/exons_start_1.txt') as in_file:
        total = []
        for line in in_file:
            no_p_line = line.replace('P', '').lower().replace('\n', '')
            total.append(no_p_line)
    # Convert each sequence to order-2 symbols.
    converted_total = [converter_to(x, 2) for x in total]
    matrixDonor0 = numpy.array(
        matrix_from_exa('../data extractors/new_donor1.exa'))
    # Per-phase coding-state emission probabilities.
    c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
    print(c0.p, c1.p, c2.p)
    coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
    coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
    coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')
    donor0_data = classify(matrixDonor0, 2)
    donor0_states = sequence_state_factory(donor0_data, 'donor0')
    post = State(DiscreteDistribution(equal_distribution), name='post')
    model = HiddenMarkovModel('coding to donor')
    model.add_state(coding_state0)
    model.add_state(coding_state1)
    model.add_state(coding_state2)
    add_sequence(model, donor0_states)
    model.add_state(post)
    # Cycle through the three coding phases; from any phase the donor-site
    # chain can be entered with probability 0.4.
    model.add_transition(model.start, coding_state0, 1)
    model.add_transition(coding_state0, coding_state1, 0.6)
    model.add_transition(coding_state0, donor0_states[0], 0.4)
    model.add_transition(coding_state1, coding_state2, 0.6)
    model.add_transition(coding_state1, donor0_states[0], 0.4)
    model.add_transition(coding_state2, coding_state0, 0.6)
    model.add_transition(coding_state2, donor0_states[0], 0.4)
    model.add_transition(donor0_states[-1], post, 1)
    model.add_transition(post, post, 0.9)
    model.add_transition(post, model.end, 0.1)
    model.bake()
    # Evaluate, fit with pseudocounts (Baum-Welch), evaluate again.
    test_model(model)
    model.fit(converted_total,
              transition_pseudocount=1,
              emission_pseudocount=1,
              verbose=True)
    test_model(model)
    with open('partial_model_coding_to_donor_model0.json', 'w') as out:
        out.write(model.to_json())
from pomegranate import DiscreteDistribution
from pomegranate import HiddenMarkovModel

import calculator
from model_maker_utils import sequence_state_factory, classify, add_sequence, equal_distribution
from matrix_from_aln import matrix_from_exa
from converter_to import converter_to

# Per-phase coding emissions and the stop-codon profile matrix.
# NOTE(review): numpy and State are used below but not imported in this
# chunk; presumably they are imported elsewhere in the file.
c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
matrixStop = numpy.array(matrix_from_exa('../data extractors/new_stops.exa'))

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')
post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)
add_sequence(model, stop_states)
model.add_state(post)

# NOTE(review): this chain enters at coding_state1 (not coding_state0 as in
# the sibling coding-to-donor model) -- confirm the phase offset is intended.
model.add_transition(model.start, coding_state1, 1)
model.add_transition(coding_state0, coding_state1, 1)
model.add_transition(coding_state1, coding_state2, 1)
def crop_status_hmm_model(nn_pobability_matrix, timeseries_steps, n_observed_classes):
    """Build a 6-state crop-status HMM whose emissions are neural-network
    class probabilities.

    Hidden states (index order): 0 emergence, 1 growth, 2 flowers,
    3 senescing, 4 senesced, 5 no_crop.  All states are equally likely
    starting states; bake() normalizes the raw transition weights.
    """
    labels = ['emergence', 'growth', 'flowers', 'senescing', 'senesced', 'no_crop']

    # One NN-probability emission wrapper per hidden state (column i of the
    # predicted-probability matrix).
    states = []
    for idx, label in enumerate(labels):
        emission = NeuralNetworkWrapperCustom(
            predicted_probabilities=nn_pobability_matrix,
            i=idx,
            n_samples=timeseries_steps,
            n_classes=n_observed_classes)
        states.append(State(emission, name=label))

    model = HiddenMarkovModel()
    for state in states:
        model.add_state(state)
        model.add_transition(model.start, state, 1)

    # Raw transition weights, row = from-state, column = to-state; the
    # phenology sequence mostly self-loops (90) with small forward moves.
    weight_rows = [
        [90., 5., 0., 0., 0., 5.],
        [0., 90., 2.5, 2.5, 0., 5.],
        [0., 0., 90., 5., 0., 5.],
        [0., 0., 0., 90., 5., 5.],
        [0., 0., 0., 0., 90., 10.],
        [10., 0, 0., 0., 0., 90.],
    ]
    for state, row in zip(states, weight_rows):
        model.add_transitions(state, states, row)

    model.bake(verbose=False)
    return model
def _segment(self, arr, components=2):
    """Segment a 1-D coverage-like array into copy-number intervals with a
    GMM-initialized HMM.

    Pipeline: outlier-filter the nonzero values (Hampel, in log2 space),
    estimate mixture states from both log2 and linear values, build an HMM
    with one Gaussian-mixture state per estimated mean, Baum-Welch train it,
    Viterbi-decode, then collapse the decoded labels into intervals.

    Parameters
    ----------
    arr : array-like — input signal; zeros are treated as missing.
    components : int — Gaussians per mixture emission (means offset
        ±0.5 around each state mean).

    Returns
    -------
    (newarr, seg, scale) : the outlier-cleaned array, the list of
        (start, end) interval tuples, and 'log' or 'linear'.
    """
    nonzero = arr[arr > 0]
    # Hampel filter in log2 space flags outliers; keep only inliers.
    idx = self.hampel_filter(np.log2(nonzero))
    filtered = nonzero[idx]
    # Estimate candidate states on both scales.
    log_gmm = self.get_states(np.log2(filtered))
    log_means, log_probs = log_gmm.means_.ravel(), log_gmm.weights_
    ln_gmm = self.get_states(filtered) # to improve the sensitivity
    ln_means, ln_probs = ln_gmm.means_.ravel(), ln_gmm.weights_
    # Fall back to the linear-scale fit only when log2 finds a single state.
    if (len(log_means) == 1):
        means, probs = ln_means, ln_probs
        scale = 'linear'
    else:
        means, probs = log_means, log_probs
        scale = 'log'
    logger.info('Estimated HMM state number: {0} ({1} scale)'.format(len(means), scale))
    model = HiddenMarkovModel()
    # GMM emissions: one mixture state per estimated mean, with `components`
    # Gaussians placed at m, m-0.5, m+0.5, ... (alternating sides), sigma 0.5.
    dists = []
    for m in means:
        tmp = []
        for i in range(components):
            e = m + (-1)**i * ((i+1)//2) * 0.5
            s = 0.5
            tmp.append(NormalDistribution(e, s))
        mixture = State(GeneralMixtureModel(tmp), name=str(m))
        dists.append(mixture)
    model.add_states(*tuple(dists))
    # transition matrix: sticky self-transitions (0.8), remaining 0.2 mass
    # spread uniformly over the other states.
    for i in range(len(means)):
        for j in range(len(means)):
            if i==j:
                model.add_transition(dists[i], dists[j], 0.8)
            else:
                model.add_transition(dists[i], dists[j], 0.2/(len(means)-1))
    # starts and ends: start probabilities taken from the GMM weights.
    for i in range(len(means)):
        model.add_transition(model.start, dists[i], probs[i])
    model.bake()
    # training sequences: rebuild an arr-sized array with outliers zeroed.
    tmp = np.zeros(nonzero.size)
    tmp[idx] = filtered
    newarr = np.zeros(arr.size)
    newarr[arr > 0] = tmp
    if len(means) > 1:
        # Baum-Welch refinement on the per-piece sequences, then Viterbi
        # decode (skipping the start state at index 0) to per-point labels.
        model.fit(self.pieces(newarr, scale=scale), algorithm='baum-welch', n_jobs=self.n_jobs,
                  max_iterations=5000, stop_threshold=2e-4)
        queue = newarr[newarr > 0]
        if scale=='log':
            seq = np.r_[[s.name for i, s in model.viterbi(np.log2(queue))[1][1:]]]
        else:
            seq = np.r_[[s.name for i, s in model.viterbi(queue)[1][1:]]]
        seg = self.assign_cnv(queue, seq)
        predicted = np.zeros(newarr.size)
        predicted[newarr > 0] = seg
        seg = self.call_intervals(predicted)
    else:
        # Single state: the whole array is one segment.
        seg = [(0, newarr.size)]
    return newarr, seg, scale
from pathlib import Path
from xml.etree import ElementTree
from gene_ebi_to_string import to_string
from pomegranate import HiddenMarkovModel
from pomegranate import State
from pomegranate import DiscreteDistribution
from converter_to import converter_to

hmmodel = HiddenMarkovModel()

# 'back': background state, all four nucleotides equally likely.
back_state = State(
    DiscreteDistribution({'a': 0.25, 'c': 0.25, 'g': 0.25, 't': 0.25}),
    name='back')

# 'fixed': biased state, strongly favouring a/c over g/t.
fixed_state = State(
    DiscreteDistribution({'a': 0.45, 'c': 0.45, 'g': 0.05, 't': 0.05}),
    name='fixed')

hmmodel.add_states(back_state, fixed_state)

# Always begin in the background state; it is sticky (0.9 self-loop).
hmmodel.add_transition(hmmodel.start, back_state, 1)
hmmodel.add_transition(back_state, back_state, 0.9)
def dominant_cover_hmm_model(nn_pobability_matrix, timeseries_steps, n_observed_classes):
    """Build a 5-state dominant-cover HMM whose emissions are neural-network
    class probabilities.

    Hidden states (index order): 0 vegetation, 1 residue, 2 soil, 3 snow,
    4 water.  All states are equally likely starting states; bake()
    normalizes the raw transition weights.
    """
    labels = ['vegetation', 'residue', 'soil', 'snow', 'water']

    # One NN-probability emission wrapper per hidden state (column k of the
    # predicted-probability matrix).
    states = [
        State(
            NeuralNetworkWrapperCustom(
                predicted_probabilities=nn_pobability_matrix,
                i=k,
                n_samples=timeseries_steps,
                n_classes=n_observed_classes),
            name=label)
        for k, label in enumerate(labels)
    ]

    model = HiddenMarkovModel()
    for state in states:
        model.add_state(state)
        model.add_transition(model.start, state, 1)

    # Strong self-transition weight (95.) with uniform leakage (1.0) to
    # every other state.
    n = len(states)
    for row, state in enumerate(states):
        weights = [95. if col == row else 1.0 for col in range(n)]
        model.add_transitions(state, states, weights)

    model.bake(verbose=False)
    return model