Example 1
    def update_hmm(self):
        num_states = self.num_states
        start_prob = self.start_prob
        num_emissions = self.num_emissions

        hmm = HiddenMarkovModel('hmm')
        dist = [
            DiscreteDistribution(
                dict(zip(range(num_emissions), self.emissions[i])))
            for i in range(num_states)
        ]
        states = [
            State(dist[i], 's' + str(i).zfill(2)) for i in range(num_states)
        ]
        hmm.add_states(states)
        for i in range(num_states):
            s_i = states[i]
            hmm.add_transition(hmm.start, s_i, start_prob[i])
            for j in range(num_states):
                s_j = states[j]
                p = self.transitions[i, j]
                hmm.add_transition(s_i, s_j, p)

        self.hmm = hmm
        self.hmm.bake()
Example 2
def load_segmentation_model(modeldata):
    model = HiddenMarkovModel('model')

    states = {}
    for s in modeldata:
        if len(s['emission']) == 1:
            emission = NormalDistribution(*s['emission'][0][:2])
        else:
            weights = np.array([w for _, _, w in s['emission']])
            dists = [NormalDistribution(mu, sigma)
                     for mu, sigma, _ in s['emission']]
            emission = GeneralMixtureModel(dists, weights=weights)
        state = State(emission, name=s['name'])

        states[s['name']] = state
        model.add_state(state)
        if 'start_prob' in s:
            model.add_transition(model.start, state, s['start_prob'])

    for s in modeldata:
        current = states[s['name']]
        for nextstate, prob in s['transition']:
            model.add_transition(current, states[nextstate], prob)

    model.bake()

    return model
Example 3
    def oriHMMParams(self):
        """
        Set initial parameters for the Hidden Markov Model (HMM).
        
        Attributes
        ----------
        HMMParams : dict
            Has 3 keys: "A", the state transition matrix; "B", the emission
            probabilities, specifying the parameters (means, variances, weights)
            of the Gaussian mixture distributions for each hidden state; and
            "pi", the hidden state weights. This dict will be updated after the
            learning procedure.
        """
        hmm = HiddenMarkovModel()
        # GMM emissions
        # 5 Hidden States:
        # 0--start, 1--downstream, 2--no bias, 3--upstream, 4--end
        numdists = 3  # Three-distribution Gaussian Mixtures
        var = 7.5 / (numdists - 1)
        means = [[], [], [], [], []]
        for i in range(numdists):
            means[4].append(i * 7.5 / (numdists - 1) + 2.5)
            means[3].append(i * 7.5 / (numdists - 1))
            means[2].append((i - (numdists - 1) / 2) * 7.5 / (numdists - 1))
            means[1].append(-i * 7.5 / (numdists - 1))
            means[0].append(-i * 7.5 / (numdists - 1) - 2.5)
        states = []
        for i, m in enumerate(means):
            tmp = []
            for j in m:
                tmp.append(NormalDistribution(j, var))
            mixture = GeneralMixtureModel(tmp)
            states.append(State(mixture, name=str(i)))
        hmm.add_states(*tuple(states))

        # Transition matrix
        #A = [[0., 1., 0., 0., 0.],
        #    [0., 0.4, 0.3, 0.3, 0.],
        #    [0.05, 0., 0.5, 0.45, 0.],
        #    [0., 0., 0., 0.5, 0.5],
        #    [0.99, 0., 0.01, 0., 0.]]
        hmm.add_transition(states[0], states[1], 1)
        hmm.add_transition(states[1], states[1], 0.4)
        hmm.add_transition(states[1], states[2], 0.3)
        hmm.add_transition(states[1], states[3], 0.3)
        hmm.add_transition(states[2], states[0], 0.05)
        hmm.add_transition(states[2], states[2], 0.5)
        hmm.add_transition(states[2], states[3], 0.45)
        hmm.add_transition(states[3], states[3], 0.5)
        hmm.add_transition(states[3], states[4], 0.5)
        hmm.add_transition(states[4], states[0], 0.99)
        hmm.add_transition(states[4], states[2], 0.01)

        pi = [0.05, 0.3, 0.3, 0.3, 0.05]
        for i in range(len(states)):
            hmm.add_transition(hmm.start, states[i], pi[i])

        hmm.bake()

        return hmm
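
    # Hedged usage sketch (not from the original source): the "learning procedure"
    # mentioned in the docstring above is typically Baum-Welch re-estimation with
    # pomegranate's fit(). `signal_tracks` is a placeholder name for a list of 1-D
    # observation sequences and is an assumption, not part of the original class.
    def fit_hmm_sketch(self, signal_tracks):
        hmm = self.oriHMMParams()
        hmm.fit(signal_tracks, algorithm='baum-welch')
        # Read back the re-estimated transition matrix ("A"); after bake() it also
        # contains rows/columns for the silent start and end states.
        return hmm.dense_transition_matrix()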
Example 4
def build_an_hmm_example():
    # The characters in each DiscreteDistribution define the emission distribution for that state:
    # the probability of seeing each character when the system is in that state.
    d1 = DiscreteDistribution({'A': 0.35, 'C': 0.20, 'G': 0.05, 'T': 0.40})
    d2 = DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25})
    d3 = DiscreteDistribution({'A': 0.10, 'C': 0.40, 'G': 0.40, 'T': 0.10})

    s1 = State(d1, name="s1")
    s2 = State(d2, name="s2")
    s3 = State(d3, name="s3")

    model = HiddenMarkovModel('example')
    model.add_states([s1, s2, s3])
    model.add_transition(model.start, s1, 0.90)
    model.add_transition(model.start, s2, 0.10)
    model.add_transition(s1, s1, 0.80)
    model.add_transition(s1, s2, 0.20)
    model.add_transition(s2, s2, 0.90)
    model.add_transition(s2, s3, 0.10)
    model.add_transition(s3, s3, 0.70)
    model.add_transition(s3, model.end, 0.30)
    model.bake()

    for i in range(len(model.states)):
        print(model.states[i].name)
    model.plot()
    #print(model.log_probability(list('ACGACTATTCGAT')))

    #print(", ".join(state.name for i, state in model.viterbi(list('ACGACTATTCGAT'))[1]))

    print("forward:", model.forward(list('ACG')))
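
    # Hedged sketch (not part of the original example): score a full sequence and
    # Viterbi-decode its most likely hidden-state path with the model built above;
    # it mirrors the commented-out calls a few lines up.
    seq = list('ACGACTATTCGAT')
    print("log-likelihood:", model.log_probability(seq))
    # viterbi() returns (log-probability of the best path, [(index, State), ...]);
    # the first entry of the path is the silent start state, so it is skipped here.
    viterbi_logp, path = model.viterbi(seq)
    print("viterbi path:", ", ".join(state.name for idx, state in path[1:]))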
Example 5
def bake_model(tags_sequence, words_sequence):
    """
    'tags' are the time-demand labels that generate the emitted demand levels.
    Demand levels are represented by 'words'.
    """
    # rdemand
    words = [x for x in chain(*words_sequence)]
    tag_unigrams = unigram_counts(words)
    tag_bigrams = bigram_counts(words)

    # Uniform distribution for starting and ending labels
    all_labels = list(set(words))
    tag_starts = starting_counts(all_labels)
    tag_ends = ending_counts(all_labels)

    basic_model = HiddenMarkovModel(name="base-hmm-tagger")

    # Emission count
    label_train = tags_sequence
    rdemand_train = words_sequence
    emission_count = pair_counts(rdemand_train, label_train)

    # States with emission probability distributions P(word | tag)
    states = []
    for rdemand, label_dict in emission_count.items():
        dist_tag = DiscreteDistribution({
            label: cn / tag_unigrams[rdemand]
            for label, cn in label_dict.items()
        })
        states.append(State(dist_tag, name=rdemand))

    basic_model.add_states(states)
    state_names = [s.name for s in states]
    state_index = {tag: num for num, tag in enumerate(state_names)}

    # Start transition
    total_start = sum(tag_starts.values())
    for tag, cn in tag_starts.items():
        # sname = state_index[tag]
        basic_model.add_transition(basic_model.start, states[state_index[tag]],
                                   cn / total_start)

    # End transition
    total_end = sum(tag_ends.values())
    for tag, cn in tag_ends.items():
        basic_model.add_transition(states[state_index[tag]], basic_model.end,
                                   cn / total_end)

    # Edges between states for the observed transition frequencies P(tag_i | tag_i-1)
    for key, value in tag_bigrams.items():
        basic_model.add_transition(states[state_index[key[0]]],
                                   states[state_index[key[1]]],
                                   value / tag_unigrams[key[0]])

    # Finalize the model
    basic_model.bake()

    return basic_model
Example 6
def hmmer2pom(hmm):
    # set up environment
    from math import exp
    from pomegranate import DiscreteDistribution,HiddenMarkovModel,State
    tags = dict(); header = 0; alphabet = None; hmmlines = list()

    # parse HMMER file
    for line in hmm.splitlines():
        l = line.strip()
        if len(l) == 0 or l[0] == '#':
            continue
        elif header == 0:
            if l.startswith('HMM') and l[3] != 'E': # beginning of actual HMM
                header = 1; alphabet = l.split()[1:]
            else:
                parts = l.strip().split()
                if parts[0] in tags:
                    if not isinstance(tags[parts[0]], list):
                        tags[parts[0]] = [tags[parts[0]]]
                    tags[parts[0]].append(' '.join(parts[1:]))
                else:
                    tags[parts[0]] = ' '.join(parts[1:])
        elif header == 1:
            header = 2
        else:
            if l.startswith('COMPO'):
                parts = l.strip().split(); tags[parts[0]] = ' '.join(parts[1:])
            else:
                hmmlines.append(l)

    # create all states
    model = HiddenMarkovModel(tags['NAME']); tmpstates = list(); K = 0
    i_emit = hmmlines[0].split(); tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I0")) # insertion state
    for l in range(2,len(hmmlines),3):
        m_emit,i_emit,state_trans = [hmmlines[l+i].split() for i in range(0,3)]; K = int(m_emit[0])
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(m_emit[i+1])) for i in range(len(alphabet))}), name="M%d" % K)) # match state
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I%d" % K)) # insertion state
        tmpstates.append(State(None, name="D%d" % K)) # deletion state
    assert K != 0, "No match states in profile HMM"
    model.add_states(tmpstates); name2state = {state.name:state for state in tmpstates}; name2state["M0"] = model.start; name2state["M%d"%(K+1)] = model.end

    # create all transitions
    for l in range(1,len(hmmlines),3):
        k = int(l/3); parts = hmmlines[l].split()
        model.add_transition(name2state["M%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[0])))     # 0: M_k -> M_k+1
        model.add_transition(name2state["M%d"%k], name2state["I%d"%k],     exp(-1*float(parts[1])))     # 1: M_k -> I_k
        if parts[2] != '*': # no D_k+1 in last row
            model.add_transition(name2state["M%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[2]))) # 2: M_k -> D_k+1
        model.add_transition(name2state["I%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[3])))     # 3: I_k -> M_k+1
        model.add_transition(name2state["I%d"%k], name2state["I%d"%k],     exp(-1*float(parts[4])))     # 4: I_k -> I_k
        if k != 0: # no D0 state
            model.add_transition(name2state["D%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[5]))) # 5: D_k -> M_k+1
        if parts[6] != '*': # no D0 state and no D_k+1 in last row
            model.add_transition(name2state["D%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[6]))) # 6: D_k -> D_k+1
    model.bake()
    return model.to_json()
Example 7
def train_hmm_tagger(data):
    # HMM
    # Use the tag unigrams and bigrams calculated above to construct a hidden Markov tagger.
    #
    # - Add one state per tag
    #     - The emission distribution at each state should be estimated with the formula: $P(w|t) = \frac{C(t, w)}{C(t)}$
    # - Add an edge from the starting state `basic_model.start` to each tag
    #     - The transition probability should be estimated with the formula: $P(t|start) = \frac{C(start, t)}{C(start)}$
    # - Add an edge from each tag to the end state `basic_model.end`
    #     - The transition probability should be estimated with the formula: $P(end|t) = \frac{C(t, end)}{C(t)}$
    # - Add an edge between _every_ pair of tags
    #     - The transition probability should be estimated with the formula: $P(t_2|t_1) = \frac{C(t_1, t_2)}{C(t_1)}$
    basic_model = HiddenMarkovModel(name="base-hmm-tagger")

    state_dict = {}
    states = []
    emission_counts = pair_counts(*list(zip(
        *data.training_set.stream()))[::-1])
    for tag in emission_counts.keys():
        tag_count = tag_unigrams[tag]
        probs = {}
        for w in emission_counts[tag]:
            probs[w] = emission_counts[tag][w] / tag_count
        emission_p = DiscreteDistribution(probs)
        state = State(emission_p, name="" + tag)
        basic_model.add_state(state)
        state_dict[tag] = state

    for tag in tag_starts:
        basic_model.add_transition(basic_model.start, state_dict[tag],
                                   tag_starts[tag] / len(data.training_set.Y))
        basic_model.add_transition(state_dict[tag], basic_model.end,
                                   tag_ends[tag] / tag_unigrams[tag])

    for (tag1, tag2) in tag_bigrams:
        basic_model.add_transition(
            state_dict[tag1], state_dict[tag2],
            tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])

    # finalize the model
    basic_model.bake()

    assert all(
        tag in set(s.name for s in basic_model.states)
        for tag in data.training_set.tagset
    ), "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
    assert basic_model.edge_count() == 168, (
        "Your network should have an edge from the start node to each state, one edge between every "
        +
        "pair of tags (states), and an edge from each state to the end node.")
    HTML(
        '<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>'
    )
    return basic_model
Example 8
def insert_delete_main_hmm(data_matrix):
    v_columns = column_clasify(data_matrix)
    v_zones = create_zones(v_columns)
    v_grouped_states = group_states(v_zones, 'test')
    v_model = HiddenMarkovModel()
    v_first_state = State(None, name='ali_start')
    v_last_state = State(None, name='ali_end')
    v_model.add_state(v_first_state)
    v_model.add_transition(v_model.start, v_first_state, 1)
    v_model.add_state(v_last_state)
    add_states(v_model, v_grouped_states)
    v_trans = calculate_transitions(v_first_state, v_last_state,
                                    v_grouped_states)
    apply_transitions(v_model, v_trans)
    v_model.bake()
    return v_model
Example 9
def _initialize_new_hmm(hmm, new_states, new_transitions):

    new_hmm = HiddenMarkovModel()
    for state in new_states:
        if state not in (hmm.start, hmm.end):
            new_hmm.add_state(state)
    for source_state, target_state, probability in new_transitions:
        if source_state != hmm.start and target_state != hmm.end:
            new_hmm.add_transition(source_state, target_state, probability)
        elif source_state == hmm.start:
            new_hmm.add_transition(new_hmm.start, target_state, probability)
        elif target_state == hmm.end:
            new_hmm.add_transition(source_state, new_hmm.end, probability)

    new_hmm.bake()
    return new_hmm
Example 10
def init_model(start_dip, stay_state, mean_eu, sd_eu, mean_loh):

    ## define distributions
    d_eu = NormalDistribution(mean_eu, sd_eu)  ## euploid enriched at 0
    d_loh = NormalDistribution(mean_loh,
                               sd_eu)  ## loss of heterozygosity enriched at 1
    d_aneu = NormalDistribution(mean_loh / 2.0,
                                sd_eu * 1.4)  ## aneuploid enriched at 1

    ## define states
    s_eu = State(d_eu, name='EU')  ## enriched at 0
    s_loh = State(d_loh, name='LOH')  ## enriched at 1
    s_aneu = State(d_aneu, name='ANEU')  ## enriched at 1

    ## define model and pass in states
    model = HiddenMarkovModel()
    model.add_states(s_eu, s_loh, s_aneu)

    ## define transition matrix (state a, state b, probability)
    model.add_transition(model.start, s_eu, start_dip)
    model.add_transition(model.start, s_loh, 1.0 - start_dip - 0.1)
    model.add_transition(model.start, s_aneu, 0.1)

    model.add_transition(s_eu, s_eu, stay_state)
    model.add_transition(s_eu, s_loh, 1.0 - 4 * stay_state / 5 - 0.001)
    model.add_transition(s_eu, s_aneu, 1.0 - stay_state / 5 - 0.001)
    model.add_transition(s_eu, model.end, 0.002)

    model.add_transition(s_loh, s_loh, stay_state)
    model.add_transition(s_loh, s_eu, 1.0 - 4 * stay_state / 5 - 0.001)
    model.add_transition(s_loh, s_aneu, 1.0 - stay_state / 5 - 0.001)
    model.add_transition(s_loh, model.end, 0.002)

    model.add_transition(s_aneu, s_aneu, stay_state)
    model.add_transition(s_aneu, s_eu, 1.0 - stay_state / 2 - 0.001)
    model.add_transition(s_aneu, s_loh, 1.0 - stay_state / 2 - 0.001)
    model.add_transition(s_aneu, model.end, 0.002)

    ## finalize internal structure
    model.bake()
    ## only train transitions, not emissions
    model.freeze_distributions()

    return model
Example 11
def ghmm_model(states_labels: tuple,
               transitions: tuple,
               init_prob: tuple,
               end_prob: tuple,
               means: list,
               vars: list) -> HiddenMarkovModel:
    """

    :param states_labels:
    :param transitions:
    :param init_prob:
    :param end_prob:
    :param means:
    :param vars:
    :return:
    """
    hmm_model = HiddenMarkovModel()

    mix_num = len(vars[0])
    states = []
    for state_i, state in enumerate(states_labels):
        mixture = []
        for mix_i in range(mix_num):
            init_mean = means[state_i][mix_i]
            init_var = vars[state_i][mix_i]
            mixture.append(NormalDistribution(init_mean, init_var))
        states.append(State(GeneralMixtureModel(mixture), name=str(state_i)))
    hmm_model.add_states(*tuple(states))

    for row in range(len(states_labels)):
        for col in range(len(states_labels)):
            prob = transitions[row][col]
            if prob != 0.:
                hmm_model.add_transition(states[row], states[col], prob)
    for state_i, prob in enumerate(init_prob):
        if prob != 0.:
            hmm_model.add_transition(hmm_model.start, states[state_i], prob)
    for state_i, prob in enumerate(end_prob):
        if prob != 0.:
            hmm_model.add_transition(states[state_i], hmm_model.end, prob)

    hmm_model.bake()

    return hmm_model
Example 12
def build_the_same_model_in_test_sample_from_site_line_by_line():

    # A State holds the emission distribution, but not the
    # transition distribution, because that's stored in the graph edges.
    s1 = State(NormalDistribution(5, 1))
    s2 = State(NormalDistribution(1, 7))
    s3 = State(NormalDistribution(8, 2))
    model = HiddenMarkovModel()
    model.add_states(s1, s2, s3)
    model.add_transition(model.start, s1, 1.0)
    model.add_transition(s1, s1, 0.7)
    model.add_transition(s1, s2, 0.3)
    model.add_transition(s2, s2, 0.8)
    model.add_transition(s2, s3, 0.2)
    model.add_transition(s3, s3, 0.9)
    model.add_transition(s3, model.end, 0.1)
    model.bake()

    model.plot()
Example 13
def buildHmm(minAmpliconLength, maxGap, windowSize):
    b_bkgd_1 = 0.1
    a_interstate = b_bkgd_1**(2 * minAmpliconLength / windowSize)
    b_amp_0 = (a_interstate)**(0.5 * windowSize / maxGap)
    b_amp_1 = 1 - b_amp_0
    b_bkgd_0 = 1 - b_bkgd_1
    bkgdDist = DiscreteDistribution({0: b_bkgd_0, 1: b_bkgd_1})
    ampDist = DiscreteDistribution({0: b_amp_0, 1: b_amp_1})
    s_bkgd = State(bkgdDist, name='background')
    s_amp = State(ampDist, name='amplicon')
    hmm = HiddenMarkovModel()
    hmm.add_states(s_bkgd, s_amp)
    hmm.add_transition(hmm.start, s_bkgd, 1 - a_interstate)
    hmm.add_transition(hmm.start, s_amp, a_interstate)
    hmm.add_transition(s_bkgd, s_bkgd, 1 - a_interstate)
    hmm.add_transition(s_bkgd, s_amp, a_interstate)
    hmm.add_transition(s_amp, s_bkgd, a_interstate)
    hmm.add_transition(s_amp, s_amp, 1 - a_interstate)
    hmm.bake()
    return hmm
Example 14
def create_hidden_MarkovModel(e_df, q_df, start_p_dict):
    """
    Creates a Hidden Markov Model based on DataFrame
    @args:
        - e_df (pd.Dataframe): contains the emission probabilities
        - q_df (pd.Dataframe): contains the transition probabilities
    """
    model = HiddenMarkovModel(name="Example Model")

    '#1: Create a dict for each key in trans. df'
    model_dict = {}
    for key in q_df.keys().values:
        model_dict[key] = {}

    '#2: Create the states'
    for key in model_dict:
        '#2.1 Step: Add the emission prob. to each state, P(observation | state)'
        emission_p = DiscreteDistribution(e_df[key].to_dict())
        model_dict[key] = State(emission_p, name=key)
        model.add_state(model_dict[key])
        '#2.2.Step: Add the start probability for each state'
        model.add_transition(model.start, model_dict[key], start_p_dict[key])

    '#3.Step: Add the transition probability to each state'
    for key, item in q_df.to_dict("index").items():
        for item_name, value in item.items():
            print(key, " , ", item_name, ": ", value)
            tmp_origin = model_dict[key]
            tmp_destination = model_dict[item_name]
            model.add_transition(tmp_origin, tmp_destination,
                                 q_df.loc[key, item_name])
    # finally, call the .bake() method to finalize the model
    model.bake()

    return model
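
# Hedged usage sketch (not part of the original source), showing the DataFrame
# layout the function expects: one emission column per state (rows are observation
# symbols) and a square transition DataFrame indexed and labelled by state names.
# All names and numbers below are placeholder assumptions.
import pandas as pd

e_df = pd.DataFrame({'Sunny': {'yes': 0.1, 'no': 0.9},
                     'Rainy': {'yes': 0.8, 'no': 0.2}})
q_df = pd.DataFrame([[0.8, 0.2], [0.4, 0.6]],
                    index=['Sunny', 'Rainy'], columns=['Sunny', 'Rainy'])
start_p = {'Sunny': 0.5, 'Rainy': 0.5}
model = create_hidden_MarkovModel(e_df, q_df, start_p)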
Example 15
    def cluster(self):
        if self.preprocessed_data is None:
            print("No preprocessed_data attribute found")
            return -1

        if self.alg == "Kmeans":
            from sklearn.cluster import KMeans
            km = KMeans(n_clusters=self.K, precompute_distances=True)
            km.fit(np.concatenate(
                self.preprocessed_data))  #flattens all dates together
            self.states = [km.predict(d) for d in self.preprocessed_data]

        elif self.alg == "HMM":
            from pomegranate import HiddenMarkovModel, MultivariateGaussianDistribution
            distribution = MultivariateGaussianDistribution
            hmm = HiddenMarkovModel.from_samples(
                distribution, n_components=self.K,
                X=self.preprocessed_data.copy())
            self.states = [
                np.array(hmm.predict(d.copy())) for d in self.preprocessed_data
            ]
        else:
            print("Unrecognised or undefined clustering algorithm.")
            return -1
        self.experiment_progress = 2
Example 16
mfc_training_acc = accuracy(data.training_set.X, data.training_set.Y,
                            mfc_model)
print("training accuracy mfc_model: {:.2f}%".format(100 * mfc_training_acc))
mfc_testing_acc = accuracy(data.testing_set.X, data.testing_set.Y, mfc_model)
print("testing accuracy mfc_model: {:.2f}%".format(100 * mfc_testing_acc))

# Calculate unigram_counts with a list of tag sequences from the training set
tag_unigrams = unigram_counts(data.training_set.Y)
# Calculate bigram_counts with a list of tag sequences from the training set
tag_bigrams = bigram_counts(data.training_set.Y)
# Calculate the count of each tag starting a sequence
tag_starts = starting_counts(data.training_set.Y)
# Calculate the count of each tag ending a sequence
tag_ends = ending_counts(data.training_set.Y)

basic_model = HiddenMarkovModel(name="base-hmm-tagger")

# Create states with emission probability distributions P(word | tag) and add to the model
tag_states = {}

for tag in data.training_set.tagset:
    tag_emissions = DiscreteDistribution({
        word: emission_counts[tag][word] / tag_unigrams[tag]
        for word in emission_counts[tag]
    })
    tag_states[tag] = State(tag_emissions, name=tag)
    basic_model.add_state(tag_states[tag])

# Add a start transition for each tag, P(tag | start) = C(start, tag) / C(start)
for tag in data.training_set.tagset:
    basic_model.add_transition(basic_model.start, tag_states[tag],
                               tag_starts[tag] / len(data.training_set.Y))
Example 17
if len(end_tag_counts) < len(data.training_set.tagset):
    for tag in data.training_set.tagset:
        if tag not in end_tag_counts:
            end_tag_counts[tag] = 0

################## 5. COUNT NUMBER OF (TAG_i, WORD_i) PAIRS ###################
#######################  pair_counts[tag][word] = k  ##########################
pair_counts = defaultdict(lambda: defaultdict(lambda: 0))

for sentence_idx, sentence in enumerate(data.training_set.Y):
    for word_idx, tag in enumerate(sentence):
        word = data.training_set.X[sentence_idx][word_idx]
        pair_counts[tag][word] += 1

############################# 6. BUILD HMM MODEL ##############################
HMM_model = HiddenMarkovModel(name="HMM-Tagger")
tag_states = []  # state for each tag

################# (6.1) ADD STATES w/ EMISSION PROBABILITIES ##################
''' 
tag_emissions: P(word_i|tag_j) 
             = P(word_i, tag_j)/P(tag_j)
             = C((word_i, tag_j) pairs)/C(tag_j)
'''
for tag in data.training_set.tagset:
    tag_emissions = DiscreteDistribution({word:pair_counts[tag][word]/single_tag_counts[tag] \
                                          for word in data.training_set.vocab})
    tag_state = State(tag_emissions, name=tag)
    tag_states.append(tag_state)
    HMM_model.add_states(tag_state)
Example 18
converted_total = [converter_to(x, 2) for x in total]

matrixDonor0 = numpy.array(matrix_from_exa('new_donor1.exa'))

c0, c1, c2 = calculator.calculate_proba2('cuts.txt')

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

donor0_data = classify(matrixDonor0, 2)
donor0_states = sequence_state_factory(donor0_data, 'donor0')

post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding to donor')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)

add_sequence(model, donor0_states)

model.add_state(post)

model.add_transition(model.start, coding_state0, 1)

model.add_transition(coding_state0, coding_state1, 0.6)
model.add_transition(coding_state0, donor0_states[0], 0.4)

model.add_transition(coding_state1, coding_state2, 0.6)
Example 19
matrix_GC = numpy.array(matrix_from_fasta('gc_completo.seq'))
matrix_CCAAT = numpy.array(matrix_from_fasta('CCAAT_completa.seq'))
matrix_Inr = numpy.array(matrix_from_fasta('Inr_completo.seq'))
matrix_no_inr = numpy.array(matrix_from_fasta('no_inr.fa'))

gc_data = classify(matrix_GC, 2)
tata_data = classify(matrix_TATA, 2)
cat_data = classify(matrix_CCAAT, 2)
inr_data = classify(matrix_Inr, 2)
no_inr_data = classify(matrix_no_inr, 2)

no_coding = calculator.intron_calculator('cuts_intron.txt')


# Model
promoter_utr_model = HiddenMarkovModel('promoter')

# States
back = State(DiscreteDistribution(no_coding.p), name='back')

gc_states = sequence_state_factory(gc_data, 'GC')
post_gc_var_spacers_tss = spacer_states_maker(151, no_coding.p, 'post gc var spacer tss')
post_gc_spacers_tss = spacer_states_maker(38, no_coding.p, 'post gc spacer tss')

post_gc_var_spacers_tata = spacer_states_maker(151, no_coding.p, 'post gc var spacer tata')
post_gc_spacers_tata = spacer_states_maker(18, no_coding.p, 'post gc spacer tata')


cat_states = sequence_state_factory(cat_data, 'CAT')
post_cat_var_spacers_tss = spacer_states_maker(151, no_coding.p, 'post cat var spacer tss')
post_cat_spacers_tss = spacer_states_maker(42, no_coding.p, 'post cat spacer tss')
Example 20
import numpy
from pomegranate import State
from pomegranate import DiscreteDistribution
from pomegranate import HiddenMarkovModel
import calculator
from converter_to import converter_to
from model_maker_utils import sequence_state_factory
from model_maker_utils import classify
from model_maker_utils import add_sequence
from model_maker_utils import equal_distribution
from matrix_from_aln import matrix_from_exa

matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

intron = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p),
               name='in')
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
post = State(DiscreteDistribution(equal_distribution), name='post')

model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

model.add_transition(model.start, intron, 1)
model.add_transition(intron, intron, 0.9)
model.add_transition(intron, acceptor0_states[0], 0.1)
model.add_transition(acceptor0_states[-1], post, 1)
Example 21
donor1_data = classify(matrixDonor1, 2)
donor1_states = sequence_state_factory(donor1_data, 'donor1')

donor2_data = classify(matrixDonor2, 2)
donor2_states = sequence_state_factory(donor2_data, 'donor2')

acceptor0_data = classify(matrixAcceptor0, 2)
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')

acceptor1_data = classify(matrixAcceptor1, 2)
acceptor1_states = sequence_state_factory(acceptor1_data, 'acceptor1')

acceptor2_data = classify(matrixAcceptor2, 2)
acceptor2_states = sequence_state_factory(acceptor2_data, 'acceptor2')

coding_model = HiddenMarkovModel()

intron_distribution = calculator.intron_calculator('cuts_intron.txt')
back = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p),
             name='back')

fake_back = State(DiscreteDistribution(intron_distribution.p), name='back2')

in0 = State(DiscreteDistribution(intron_distribution.p), name='in0')
in1 = State(DiscreteDistribution(intron_distribution.p), name='in1')
in2 = State(DiscreteDistribution(intron_distribution.p), name='in2')

in0_spacers = spacer_states_maker(64, intron_distribution.p, 'in0 spacer')
in1_spacers = spacer_states_maker(64, intron_distribution.p, 'in1 spacer')
in2_spacers = spacer_states_maker(64, intron_distribution.p, 'in2 spacer')
Example 22
import pomegranate


# import python modules -- this cell needs to be run again if you make changes to any of the files
import matplotlib.pyplot as plt
import numpy as np

from helpers import show_model
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution


# create the HMM model
model = HiddenMarkovModel(name="Example Model")

# emission probability distributions, P(umbrella | weather)
sunny_emissions = DiscreteDistribution({"yes": 0.1, "no": 0.9})
sunny_state = State(sunny_emissions, name="Sunny")

# TODO: create a discrete distribution for the rainy emissions from the probability table
# above & use that distribution to create a state named Rainy
rainy_emissions = DiscreteDistribution({"yes": 0.8, "no": 0.2})
rainy_state = State(rainy_emissions, name="Rainy")

# add the states to the model
Example 23
    def __init__(self):
        self.model = HiddenMarkovModel()
Example 24
def crop_type_hmm_model(nn_pobability_matrix, timeseries_steps,
                        n_observed_classes):
    # 0               1              2            3       4        5
    [
        'unknown_plant', 'large_grass', 'small_grass', 'other', 'fallow',
        'no_crop'
    ]

    d0 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=0,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d1 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=1,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d2 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=2,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d3 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=3,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d4 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=4,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)

    d5 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=5,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)

    s0_unk = State(d0, name='unknown_plant')
    s1_large = State(d1, name='large_grass')
    s2_small = State(d2, name='small_grass')
    s3_other = State(d3, name='other')
    s4_fallow = State(d4, name='fallow')
    s5_none = State(d5, name='no_crop')

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    for s in [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none]:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    model.add_transitions(
        s0_unk, [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none],
        [95., 0., 0., 0., 0., 5.])
    model.add_transitions(
        s1_large, [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none],
        [0., 95., 0., 0., 0., 5.])
    model.add_transitions(
        s2_small, [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none],
        [0., 0., 95., 0., 0., 5.])
    model.add_transitions(
        s3_other, [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none],
        [0., 0., 0., 95., 0., 5.])
    model.add_transitions(
        s4_fallow, [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none],
        [0., 0., 0., 0., 95., 5.])
    model.add_transitions(
        s5_none, [s0_unk, s1_large, s2_small, s3_other, s4_fallow, s5_none],
        [2., 2., 2., 2., 2., 90.])

    model.bake(verbose=False)

    return model
Example 25
def train_and_test():
    with open('../data extractors/exons_start_1.txt') as in_file:
        total = []
        for line in in_file:
            no_p_line = line.replace('P', '').lower().replace('\n', '')
            total.append(no_p_line)

    converted_total = [converter_to(x, 2) for x in total]

    matrixDonor0 = numpy.array(
        matrix_from_exa('../data extractors/new_donor1.exa'))

    c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
    print(c0.p, c1.p, c2.p)
    coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
    coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
    coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

    donor0_data = classify(matrixDonor0, 2)
    donor0_states = sequence_state_factory(donor0_data, 'donor0')

    post = State(DiscreteDistribution(equal_distribution), name='post')

    model = HiddenMarkovModel('coding to donor')

    model.add_state(coding_state0)
    model.add_state(coding_state1)
    model.add_state(coding_state2)

    add_sequence(model, donor0_states)

    model.add_state(post)

    model.add_transition(model.start, coding_state0, 1)

    model.add_transition(coding_state0, coding_state1, 0.6)
    model.add_transition(coding_state0, donor0_states[0], 0.4)

    model.add_transition(coding_state1, coding_state2, 0.6)
    model.add_transition(coding_state1, donor0_states[0], 0.4)

    model.add_transition(coding_state2, coding_state0, 0.6)
    model.add_transition(coding_state2, donor0_states[0], 0.4)

    model.add_transition(donor0_states[-1], post, 1)

    model.add_transition(post, post, 0.9)
    model.add_transition(post, model.end, 0.1)

    model.bake()
    test_model(model)

    model.fit(converted_total,
              transition_pseudocount=1,
              emission_pseudocount=1,
              verbose=True)

    test_model(model)

    with open('partial_model_coding_to_donor_model0.json', 'w') as out:
        out.write(model.to_json())
Example 26
import numpy

from pomegranate import DiscreteDistribution
from pomegranate import HiddenMarkovModel
from pomegranate import State
import calculator
from model_maker_utils import sequence_state_factory, classify, add_sequence, equal_distribution
from matrix_from_aln import matrix_from_exa
from converter_to import converter_to

c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
matrixStop = numpy.array(matrix_from_exa('../data extractors/new_stops.exa'))
coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)

add_sequence(model, stop_states)

model.add_state(post)

model.add_transition(model.start, coding_state1, 1)
model.add_transition(coding_state0, coding_state1, 1)
model.add_transition(coding_state1, coding_state2, 1)
Example 27
def crop_status_hmm_model(nn_pobability_matrix, timeseries_steps,
                          n_observed_classes):
    # 0            1       2          3          4          5
    ['emergence', 'growth', 'flowers', 'senescing', 'senesced', 'no_crop']

    d0 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=0,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d1 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=1,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d2 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=2,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d3 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=3,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d4 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=4,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)

    d5 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=5,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)

    s0_emerge = State(d0, name='emergence')
    s1_growth = State(d1, name='growth')
    s2_fls = State(d2, name='flowers')
    s3_sencing = State(d3, name='senescing')
    s4_senced = State(d4, name='senesced')
    s5_none = State(d5, name='no_crop')

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    for s in [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none]:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    model.add_transitions(
        s0_emerge,
        [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none],
        [90., 5., 0., 0., 0., 5.])
    model.add_transitions(
        s1_growth,
        [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none],
        [0., 90., 2.5, 2.5, 0., 5.])
    model.add_transitions(
        s2_fls, [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none],
        [0., 0., 90., 5., 0., 5.])
    model.add_transitions(
        s3_sencing,
        [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none],
        [0., 0., 0., 90., 5., 5.])
    model.add_transitions(
        s4_senced,
        [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none],
        [0., 0., 0., 0., 90., 10.])
    model.add_transitions(
        s5_none,
        [s0_emerge, s1_growth, s2_fls, s3_sencing, s4_senced, s5_none],
        [10., 0, 0., 0., 0., 90.])

    model.bake(verbose=False)

    return model
Example 28
    def _segment(self, arr, components=2):

        nonzero = arr[arr > 0]
        idx = self.hampel_filter(np.log2(nonzero))
        filtered = nonzero[idx]

        log_gmm = self.get_states(np.log2(filtered))
        log_means, log_probs = log_gmm.means_.ravel(), log_gmm.weights_
        ln_gmm = self.get_states(filtered) # to improve the sensitivity
        ln_means, ln_probs = ln_gmm.means_.ravel(), ln_gmm.weights_
        if (len(log_means) == 1):
            means, probs = ln_means, ln_probs
            scale = 'linear'
        else:
            means, probs = log_means, log_probs
            scale = 'log'

        logger.info('Estimated HMM state number: {0} ({1} scale)'.format(len(means), scale))
        model = HiddenMarkovModel()
        # GMM emissions
        dists = []
        for m in means:
            tmp = []
            for i in range(components):
                e = m + (-1)**i * ((i+1)//2) * 0.5
                s = 0.5
                tmp.append(NormalDistribution(e, s))
            mixture = State(GeneralMixtureModel(tmp), name=str(m))
            dists.append(mixture)
        model.add_states(*tuple(dists))
        # transition matrix
        for i in range(len(means)):
            for j in range(len(means)):
                if i==j:
                    model.add_transition(dists[i], dists[j], 0.8)
                else:
                    model.add_transition(dists[i], dists[j], 0.2/(len(means)-1))
        
        # starts and ends
        for i in range(len(means)):
            model.add_transition(model.start, dists[i], probs[i])
        
        model.bake()

        # training sequences
        tmp = np.zeros(nonzero.size)
        tmp[idx] = filtered
        newarr = np.zeros(arr.size)
        newarr[arr > 0] = tmp

        if len(means) > 1:
            model.fit(self.pieces(newarr, scale=scale), algorithm='baum-welch', n_jobs=self.n_jobs,
                    max_iterations=5000, stop_threshold=2e-4)
            
            queue = newarr[newarr > 0]
            
            if scale=='log':
                seq = np.r_[[s.name for i, s in model.viterbi(np.log2(queue))[1][1:]]]
            else:
                seq = np.r_[[s.name for i, s in model.viterbi(queue)[1][1:]]]
            seg = self.assign_cnv(queue, seq)
            
            predicted = np.zeros(newarr.size)
            predicted[newarr > 0] = seg
            seg = self.call_intervals(predicted)
        else:
            seg = [(0, newarr.size)]
        
        return newarr, seg, scale
Example 29
from pathlib import Path
from xml.etree import ElementTree
from gene_ebi_to_string import to_string
from pomegranate import HiddenMarkovModel
from pomegranate import State
from pomegranate import DiscreteDistribution
from converter_to import converter_to

hmmodel = HiddenMarkovModel()

back_state = State(DiscreteDistribution({
    'a': 0.25,
    'c': 0.25,
    'g': 0.25,
    't': 0.25
}),
                   name='back')

fixed_state = State(DiscreteDistribution({
    'a': 0.45,
    'c': 0.45,
    'g': 0.05,
    't': 0.05
}),
                    name='fixed')

hmmodel.add_state(back_state)
hmmodel.add_state(fixed_state)

hmmodel.add_transition(hmmodel.start, back_state, 1)
hmmodel.add_transition(back_state, back_state, 0.9)
Example 30
def dominant_cover_hmm_model(nn_pobability_matrix, timeseries_steps,
                             n_observed_classes):
    d0 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=0,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d1 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=1,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d2 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=2,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d3 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=3,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)
    d4 = NeuralNetworkWrapperCustom(
        predicted_probabilities=nn_pobability_matrix,
        i=4,
        n_samples=timeseries_steps,
        n_classes=n_observed_classes)

    s0_veg = State(d0, name='vegetation')
    s1_residue = State(d1, name='residue')
    s2_soil = State(d2, name='soil')
    s3_snow = State(d3, name='snow')
    s4_water = State(d4, name='water')

    model = HiddenMarkovModel()

    # Initialize each hidden state.
    # All states have an equal chance of being the starting state.
    for s in [s0_veg, s1_residue, s2_soil, s3_snow, s4_water]:
        model.add_state(s)
        model.add_transition(model.start, s, 1)

    model.add_transitions(s0_veg,
                          [s0_veg, s1_residue, s2_soil, s3_snow, s4_water],
                          [95., 1.0, 1.0, 1.0, 1.0])
    model.add_transitions(s1_residue,
                          [s0_veg, s1_residue, s2_soil, s3_snow, s4_water],
                          [1.0, 95., 1.0, 1.0, 1.0])
    model.add_transitions(s2_soil,
                          [s0_veg, s1_residue, s2_soil, s3_snow, s4_water],
                          [1.0, 1.0, 95., 1.0, 1.0])
    model.add_transitions(s3_snow,
                          [s0_veg, s1_residue, s2_soil, s3_snow, s4_water],
                          [1.0, 1.0, 1.0, 95., 1.0])
    model.add_transitions(s4_water,
                          [s0_veg, s1_residue, s2_soil, s3_snow, s4_water],
                          [1.0, 1.0, 1.0, 1.0, 95.])

    model.bake(verbose=False)

    return model