Beispiel #1
0
def make_main(zone, name):
    """Build the 'main' (match) state descriptor for a zone.

    Emission probabilities are estimated from the zone's single column
    with a +1 pseudocount per symbol: the first sighting of a symbol
    contributes 2 counts, each later sighting 1. Gaps ('-') are ignored.
    Returns a dict describing the main state and, when the zone allows
    deletions, a silent delete state.
    """
    counts = {}
    seen = 0
    for symbol in zone['column'].elements:
        if symbol == '-':
            continue
        if symbol in counts:
            counts[symbol] += 1
            seen += 1
        else:
            counts[symbol] = 2
            seen += 2

    emission = {symbol: count / seen for symbol, count in counts.items()}
    return {
        'type': 'main',
        'emission': emission,
        'zone': zone,
        'main_state': State(DiscreteDistribution(emission),
                            name='main ' + name),
        'delete_state': (State(None, name='none delete ' + name)
                         if zone['delete'] else None),
    }
Beispiel #2
0
def build_an_hmm_example():
    """Construct a small 3-state DNA HMM, print its state names, plot
    it, and print the forward matrix for the observation 'ACG'.

    Each DiscreteDistribution is one state's emission distribution: the
    probability of observing each nucleotide while in that state.
    """
    emissions = [
        DiscreteDistribution({'A': 0.35, 'C': 0.20, 'G': 0.05, 'T': 0.40}),
        DiscreteDistribution({'A': 0.25, 'C': 0.25, 'G': 0.25, 'T': 0.25}),
        DiscreteDistribution({'A': 0.10, 'C': 0.40, 'G': 0.40, 'T': 0.10}),
    ]
    s1 = State(emissions[0], name="s1")
    s2 = State(emissions[1], name="s2")
    s3 = State(emissions[2], name="s3")

    model = HiddenMarkovModel('example')
    model.add_states([s1, s2, s3])
    # Left-to-right topology: start -> s1/s2, chain through s3 to end.
    for source, target, prob in (
            (model.start, s1, 0.90),
            (model.start, s2, 0.10),
            (s1, s1, 0.80),
            (s1, s2, 0.20),
            (s2, s2, 0.90),
            (s2, s3, 0.10),
            (s3, s3, 0.70),
            (s3, model.end, 0.30)):
        model.add_transition(source, target, prob)
    model.bake()

    for state in model.states:
        print(state.name)
    model.plot()

    print("forward:", model.forward(list('ACG')))
Beispiel #3
0
def setup_titanic():
    """Build a toy Bayesian network of the Titanic disaster and publish
    it (plus its three distributions) via module-level globals."""
    global titanic_network, passenger, gender, tclass

    # Prior over survival.
    passenger = DiscreteDistribution({'survive': 0.6, 'perish': 0.4})

    # Gender conditioned on survival.
    gender = ConditionalProbabilityTable(
        [['survive', 'male', 0.0], ['survive', 'female', 1.0],
         ['perish', 'male', 1.0], ['perish', 'female', 0.0]], [passenger])

    # Travel class conditioned on survival.
    tclass = ConditionalProbabilityTable(
        [['survive', 'first', 0.0], ['survive', 'second', 1.0],
         ['survive', 'third', 0.0], ['perish', 'first', 1.0],
         ['perish', 'second', 0.0], ['perish', 'third', 0.0]], [passenger])

    # Nodes pair each distribution with a readable name.
    node_passenger = State(passenger, name="passenger")
    node_gender = State(gender, name="gender")
    node_class = State(tclass, name="class")

    titanic_network = BayesianNetwork("Titanic Disaster")
    titanic_network.add_nodes(node_passenger, node_gender, node_class)

    # Both gender and travel class are conditionally dependent on
    # survival.
    titanic_network.add_edge(node_passenger, node_gender)
    titanic_network.add_edge(node_passenger, node_class)
    titanic_network.bake()
    def get_bayesnet(self):
        """Assemble and return the 'User_pref' Bayesian network.

        Structure: door_lock -> light, clock_alarm -> light,
        clock_alarm -> coffee_maker.
        """
        door_lock = DiscreteDistribution({'d1': 0.7, 'd2': 0.3})
        clock_alarm = DiscreteDistribution({'a1': 0.8, 'a2': 0.2})

        # P(light | door_lock, clock_alarm); in this table only the
        # alarm value actually changes the light probabilities.
        light = ConditionalProbabilityTable(
            [['d1', 'a1', 'l1', 0.96],
             ['d1', 'a1', 'l2', 0.04],
             ['d1', 'a2', 'l1', 0.89],
             ['d1', 'a2', 'l2', 0.11],
             ['d2', 'a1', 'l1', 0.96],
             ['d2', 'a1', 'l2', 0.04],
             ['d2', 'a2', 'l1', 0.89],
             ['d2', 'a2', 'l2', 0.11]],
            [door_lock, clock_alarm])

        # P(coffee_maker | clock_alarm)
        coffee_maker = ConditionalProbabilityTable(
            [['a1', 'c1', 0.92],
             ['a1', 'c2', 0.08],
             ['a2', 'c1', 0.03],
             ['a2', 'c2', 0.97]],
            [clock_alarm])

        node_lock = State(door_lock, name="door_lock")
        node_alarm = State(clock_alarm, name="clock_alarm")
        node_light = State(light, name="light")
        node_coffee = State(coffee_maker, name="coffee_maker")

        network = BayesianNetwork("User_pref")
        network.add_nodes(node_lock, node_alarm, node_light, node_coffee)
        network.add_edge(node_lock, node_light)
        network.add_edge(node_alarm, node_coffee)
        network.add_edge(node_alarm, node_light)
        network.bake()
        return network
Beispiel #5
0
def get_variable_number_of_repeats_matcher_hmm(patterns,
                                               copies=1,
                                               vpaths=None):
    """Extend the fixed-copies repeat-matcher HMM to match a variable
    number of repeat units.

    Starts from the constant-copies model, appends two silent states
    ('start_repeating_pattern_match' / 'end_repeating_pattern_match'),
    and rewires the dense transition matrix so that each unit-end state
    either continues with probability 0.5 or exits the repeat region
    with probability 0.5. Returns a new baked model built with
    Model.from_matrix (merge disabled to keep all states).
    """
    model = get_constant_number_of_repeats_matcher_hmm(patterns, copies,
                                                       vpaths)

    # Silent entry/exit states for the repeating region.
    start_repeats_matches = State(None, name='start_repeating_pattern_match')
    end_repeats_matches = State(None, name='end_repeating_pattern_match')
    mat = model.dense_transition_matrix()
    states = model.states
    states.append(start_repeats_matches)
    states.append(end_repeats_matches)
    states_count = len(mat)
    start_repeats_ind = states_count
    end_repeats_ind = states_count + 1
    # Grow the transition matrix by one row and one column per new state.
    mat = np.c_[mat, np.zeros(states_count), np.zeros(states_count)]
    mat = np.r_[mat, [np.zeros(states_count + 2)]]
    mat = np.r_[mat, [np.zeros(states_count + 2)]]

    # Indices of every 'unit_end*' state in the original model.
    unit_ends = []
    for i, state in enumerate(model.states):
        if state.name.startswith('unit_end'):
            unit_ends.append(i)

    # Reroute model.start through the new silent start state.
    # NOTE(review): if start has several nonzero outgoing edges, only
    # the last one found is kept — assumed unique; confirm upstream.
    first_unit_start = None
    for i in range(len(mat[model.start_index])):
        if mat[model.start_index][i] != 0:
            first_unit_start = i
    mat[model.start_index][first_unit_start] = 0.0
    mat[model.start_index][start_repeats_ind] = 1
    mat[start_repeats_ind][first_unit_start] = 1

    # Each unit end: 50% continue to its (single) next state, 50% leave
    # the repeat loop via the silent end state.
    for unit_end in unit_ends:
        next_state = None
        for j in range(len(mat[unit_end])):
            if mat[unit_end][j] != 0:
                next_state = j
        mat[unit_end][next_state] = 0.5
        mat[unit_end][end_repeats_ind] = 0.5

    mat[end_repeats_ind][model.end_index] = 1

    # Start/end probability vectors for the enlarged state set.
    starts = np.zeros(states_count + 2)
    starts[model.start_index] = 1.0
    ends = np.zeros(states_count + 2)
    ends[model.end_index] = 1.0
    state_names = [state.name for state in states]
    distributions = [state.distribution for state in states]
    name = 'Repeat Matcher HMM Model'
    new_model = Model.from_matrix(mat,
                                  distributions,
                                  starts,
                                  ends,
                                  name=name,
                                  state_names=state_names,
                                  merge=None)
    new_model.bake(merge=None)
    return new_model
Beispiel #6
0
def setup_monty():
    """Build the Monty Hall Bayesian network and publish it, plus the
    index of each node in the baked network, via module-level globals."""
    global monty_network, monty_index, prize_index, guest_index

    random.seed(0)

    # The guest's pick is uniform over the three doors.
    guest = DiscreteDistribution({'A': 1. / 3, 'B': 1. / 3, 'C': 1. / 3})

    # The prize placement is uniform and independent of the guest.
    prize = DiscreteDistribution({'A': 1. / 3, 'B': 1. / 3, 'C': 1. / 3})

    # Monty's choice depends on both the guest's pick and the prize:
    # he never opens the guest's door nor the prize door.
    monty = ConditionalProbabilityTable(
        [['A', 'A', 'A', 0.0], ['A', 'A', 'B', 0.5], ['A', 'A', 'C', 0.5],
         ['A', 'B', 'A', 0.0], ['A', 'B', 'B', 0.0], ['A', 'B', 'C', 1.0],
         ['A', 'C', 'A', 0.0], ['A', 'C', 'B', 1.0], ['A', 'C', 'C', 0.0],
         ['B', 'A', 'A', 0.0], ['B', 'A', 'B', 0.0], ['B', 'A', 'C', 1.0],
         ['B', 'B', 'A', 0.5], ['B', 'B', 'B', 0.0], ['B', 'B', 'C', 0.5],
         ['B', 'C', 'A', 1.0], ['B', 'C', 'B', 0.0], ['B', 'C', 'C', 0.0],
         ['C', 'A', 'A', 0.0], ['C', 'A', 'B', 1.0], ['C', 'A', 'C', 0.0],
         ['C', 'B', 'A', 1.0], ['C', 'B', 'B', 0.0], ['C', 'B', 'C', 0.0],
         ['C', 'C', 'A', 0.5], ['C', 'C', 'B', 0.5], ['C', 'C', 'C', 0.0]],
        [guest, prize])

    # Wrap each distribution in a named node.
    guest_node = State(guest, name="guest")
    prize_node = State(prize, name="prize")
    monty_node = State(monty, name="monty")

    monty_network = BayesianNetwork("test")
    monty_network.add_nodes(guest_node, prize_node, monty_node)
    monty_network.add_edge(guest_node, monty_node)
    monty_network.add_edge(prize_node, monty_node)
    monty_network.bake()

    monty_index = monty_network.states.index(monty_node)
    prize_index = monty_network.states.index(prize_node)
    guest_index = monty_network.states.index(guest_node)
Beispiel #7
0
def hmmer2pom(hmm):
    """Convert HMMER3 profile-HMM text into a pomegranate HMM.

    Parameters
    ----------
    hmm : str
        Full text of a HMMER3 .hmm file.

    Returns
    -------
    str
        The equivalent pomegranate HiddenMarkovModel serialized as JSON.

    Notes
    -----
    HMMER stores scores as negative natural logs, so probabilities are
    recovered as exp(-score). Match (M), insert (I) and silent delete
    (D) states are created per model position; model.start and
    model.end stand in for M0 and M(K+1).
    """
    # set up environment
    from math import exp
    from pomegranate import DiscreteDistribution,HiddenMarkovModel,State
    tags = dict(); header = 0; alphabet = None; hmmlines = list()

    # parse HMMER file
    for line in hmm.splitlines():
        l = line.strip()
        if len(l) == 0 or l[0] == '#':
            continue  # skip blank and comment lines
        elif header == 0:
            if l.startswith('HMM') and l[3] != 'E': # beginning of actual HMM
                header = 1; alphabet = l.split()[1:]
            else:
                # header tag line; a repeated tag is collected into a list
                parts = l.strip().split()
                if parts[0] in tags:
                    if not isinstance(tags[parts[0]], list):
                        tags[parts[0]] = [tags[parts[0]]]
                    tags[parts[0]].append(' '.join(parts[1:]))
                else:
                    tags[parts[0]] = ' '.join(parts[1:])
        elif header == 1:
            # skip the transition-label row immediately after 'HMM ...'
            header = 2
        else:
            if l.startswith('COMPO'):
                parts = l.strip().split(); tags[parts[0]] = ' '.join(parts[1:])
            else:
                hmmlines.append(l)

    # create all states (three hmmlines per model position: match
    # emissions, insert emissions, state transitions)
    model = HiddenMarkovModel(tags['NAME']); tmpstates = list(); K = 0
    i_emit = hmmlines[0].split(); tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I0")) # insertion state
    for l in range(2,len(hmmlines),3):
        m_emit,i_emit,state_trans = [hmmlines[l+i].split() for i in range(0,3)]; K = int(m_emit[0])
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(m_emit[i+1])) for i in range(len(alphabet))}), name="M%d" % K)) # match state
        tmpstates.append(State(DiscreteDistribution({alphabet[i] : exp(-1*float(i_emit[i])) for i in range(len(alphabet))}), name="I%d" % K)) # insertion state
        tmpstates.append(State(None, name="D%d" % K)) # deletion state
    assert K != 0, "No match states in profile HMM"
    model.add_states(tmpstates); name2state = {state.name:state for state in tmpstates}; name2state["M0"] = model.start; name2state["M%d"%(K+1)] = model.end

    # create all transitions ('*' marks an impossible transition)
    for l in range(1,len(hmmlines),3):
        k = int(l/3); parts = hmmlines[l].split()
        model.add_transition(name2state["M%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[0])))     # 0: M_k -> M_k+1
        model.add_transition(name2state["M%d"%k], name2state["I%d"%k],     exp(-1*float(parts[1])))     # 1: M_k -> I_k
        if parts[2] != '*': # no D_k+1 in last row
            model.add_transition(name2state["M%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[2]))) # 2: M_k -> D_k+1
        model.add_transition(name2state["I%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[3])))     # 3: I_k -> M_k+1
        model.add_transition(name2state["I%d"%k], name2state["I%d"%k],     exp(-1*float(parts[4])))     # 4: I_k -> I_k
        if k != 0: # no D0 state
            model.add_transition(name2state["D%d"%k], name2state["M%d"%(k+1)], exp(-1*float(parts[5]))) # 5: D_k -> M_k+1
        if parts[6] != '*': # no D0 state and no D_k+1 in last row
            model.add_transition(name2state["D%d"%k], name2state["D%d"%(k+1)], exp(-1*float(parts[6]))) # 6: D_k -> D_k+1
    model.bake()
    return model.to_json()
Beispiel #8
0
def with_variations(dist, name):
    """Return (match, insert, delete) states for one profile position.

    The match state emits from the supplied distribution; the insert
    state emits nucleotides uniformly; the delete state is silent.
    """
    match_state = State(dist, name=name)
    uniform_acgt = DiscreteDistribution({base: 0.25 for base in 'acgt'})
    insert_state = State(uniform_acgt, name='i_' + name)
    delete_state = State(None, name='d_' + name)
    return match_state, insert_state, delete_state
Beispiel #9
0
    def update_hmm(self):
        """Rebuild self.hmm from the current start, transition, and
        emission parameters, then bake it."""
        n = self.num_states
        start_prob = self.start_prob

        hmm = HiddenMarkovModel('hmm')
        # One state per hidden state, each with a discrete emission
        # distribution over the emission symbol indices.
        states = []
        for i in range(n):
            emission = DiscreteDistribution(
                dict(zip(range(self.num_emissions), self.emissions[i])))
            states.append(State(emission, 's' + str(i).zfill(2)))
        hmm.add_states(states)

        # Fully connected transition structure plus start probabilities.
        for i, source in enumerate(states):
            hmm.add_transition(hmm.start, source, start_prob[i])
            for j, target in enumerate(states):
                hmm.add_transition(source, target, self.transitions[i, j])

        self.hmm = hmm
        self.hmm.bake()
Beispiel #10
0
def make_insert(zone, name):
    """Build the 'insert' state descriptor for a zone.

    Counts symbols across all of the zone's columns with a +1
    pseudocount per symbol (first sighting contributes 2 counts), skips
    gaps ('-'), and normalizes into an emission distribution.
    """
    counts = {}
    grand_total = 0
    for column in zone['columns']:
        for symbol in column.elements:
            if symbol == '-':
                continue
            if symbol in counts:
                counts[symbol] += 1
                grand_total += 1
            else:
                counts[symbol] = 2
                grand_total += 2

    emission = {symbol: count / grand_total
                for symbol, count in counts.items()}
    return {
        'type': 'insert',
        'emission': emission,
        'zone': zone,
        'insert_state': State(DiscreteDistribution(emission),
                              name='insert ' + name),
    }
Beispiel #11
0
def insert_delete_main_hmm(data_matrix):
    """Assemble a profile-style HMM (insert/delete/main states) from an
    alignment matrix and return it baked.

    Pipeline: classify columns, group them into zones, derive grouped
    states, then wire transitions between silent 'ali_start' and
    'ali_end' boundary states around the grouped states.
    """
    v_columns = column_clasify(data_matrix)
    v_zones = create_zones(v_columns)
    v_grouped_states = group_states(v_zones, 'test')
    v_model = HiddenMarkovModel()
    # Silent boundary states for the alignment.
    v_first_state = State(None, name='ali_start')
    v_last_state = State(None, name='ali_end')
    v_model.add_state(v_first_state)
    v_model.add_transition(v_model.start, v_first_state, 1)
    v_model.add_state(v_last_state)
    add_states(v_model, v_grouped_states)
    # Transition list is computed by a helper, then applied wholesale.
    v_trans = calculate_transitions(v_first_state, v_last_state,
                                    v_grouped_states)
    apply_transitions(v_model, v_trans)
    v_model.bake()
    return v_model
Beispiel #12
0
    def oriHMMParams(self):
        """
        Construct and return the initial Hidden Markov Model.

        Five hidden states (0 start, 1 downstream, 2 no bias,
        3 upstream, 4 end), each emitting from a 3-component Gaussian
        mixture. State means are shifted / sign-flipped copies of a
        common grid and all components share one variance. Transition
        probabilities and the start weights (pi) are hand-tuned.
        """
        hmm = HiddenMarkovModel()
        # GMM emissions for the 5 hidden states.
        numdists = 3  # Three-distribution Gaussian Mixtures
        var = 7.5 / (numdists - 1)
        means = [[], [], [], [], []]
        for i in range(numdists):
            base = i * 7.5 / (numdists - 1)
            means[4].append(base + 2.5)
            means[3].append(base)
            means[2].append((i - (numdists - 1) / 2) * 7.5 / (numdists - 1))
            means[1].append(-base)
            means[0].append(-base - 2.5)

        states = []
        for idx, mean_row in enumerate(means):
            components = [NormalDistribution(mu, var) for mu in mean_row]
            states.append(State(GeneralMixtureModel(components),
                                name=str(idx)))
        hmm.add_states(*tuple(states))

        # Hand-tuned transition matrix, expressed as (src, dst, prob).
        for src, dst, prob in (
                (0, 1, 1),
                (1, 1, 0.4), (1, 2, 0.3), (1, 3, 0.3),
                (2, 0, 0.05), (2, 2, 0.5), (2, 3, 0.45),
                (3, 3, 0.5), (3, 4, 0.5),
                (4, 0, 0.99), (4, 2, 0.01)):
            hmm.add_transition(states[src], states[dst], prob)

        # Initial hidden-state weights.
        pi = [0.05, 0.3, 0.3, 0.3, 0.05]
        for state, prob in zip(states, pi):
            hmm.add_transition(hmm.start, state, prob)

        hmm.bake()

        return hmm
Beispiel #13
0
def sequence_state_factory(states_data, name):
    """Create one discretely-distributed State per entry of
    states_data, named '<name><index>', and return them as a list."""
    return [
        State(DiscreteDistribution(data.states_distribution),
              name=name + str(position))
        for position, data in enumerate(states_data)
    ]
Beispiel #14
0
def load_segmentation_model(modeldata):
    """Build a segmentation HMM from a list of state descriptions.

    Each entry supplies a 'name', an 'emission' list of
    (mean, sigma, weight) triples — a single triple yields a plain
    NormalDistribution, several yield a GeneralMixtureModel — an
    optional 'start_prob', and a 'transition' list of
    (next_state_name, prob) pairs.
    """
    model = HiddenMarkovModel('model')

    states = {}
    for spec in modeldata:
        components = spec['emission']
        if len(components) == 1:
            emission = NormalDistribution(*components[0][:2])
        else:
            mixture_weights = np.array([c[2] for c in components])
            emission = GeneralMixtureModel(
                [NormalDistribution(c[0], c[1]) for c in components],
                weights=mixture_weights)
        state = State(emission, name=spec['name'])

        states[spec['name']] = state
        model.add_state(state)
        if 'start_prob' in spec:
            model.add_transition(model.start, state, spec['start_prob'])

    # Wire inter-state transitions once every state exists.
    for spec in modeldata:
        source = states[spec['name']]
        for target_name, prob in spec['transition']:
            model.add_transition(source, states[target_name], prob)

    model.bake()

    return model
Beispiel #15
0
def bake_model(tags_sequence, words_sequence):
    """Build and bake a demand-level HMM tagger.

    'tags' are the time-demand labels that generate the emitted demand
    level; demand levels are represented by 'words'.

    Parameters
    ----------
    tags_sequence, words_sequence : iterables of sequences
        Parallel training sequences of labels and demand levels.

    Returns
    -------
    HiddenMarkovModel
        The baked model, one state per demand level (rdemand).
    """
    # rdemand
    # NOTE(review): unigram/bigram statistics are computed over the
    # emitted words (demand levels), which also serve as the state
    # names below — confirm this inversion of the usual tag/word roles
    # is intended.
    words = [x for x in chain(*words_sequence)]
    tag_unigrams = unigram_counts(words)
    tag_bigrams = bigram_counts(words)

    # Uniform distribution for starting and ending labels
    all_labels = list(set(words))
    tag_starts = starting_counts(all_labels)
    tag_ends = ending_counts(all_labels)

    basic_model = HiddenMarkovModel(name="base-hmm-tagger")

    # Emission count
    label_train = tags_sequence
    rdemand_train = words_sequence
    emission_count = pair_counts(rdemand_train, label_train)

    # States with emission probability distributions P(word | tag),
    # normalized by each state's unigram count.
    states = []
    for rdemand, label_dict in emission_count.items():
        dist_tag = DiscreteDistribution({
            label: cn / tag_unigrams[rdemand]
            for label, cn in label_dict.items()
        })
        states.append(State(dist_tag, name=rdemand))

    basic_model.add_states(states)
    state_names = [s.name for s in states]
    state_index = {tag: num for num, tag in enumerate(state_names)}

    # Start transition
    total_start = sum(tag_starts.values())
    for tag, cn in tag_starts.items():
        # sname = state_index[tag]
        basic_model.add_transition(basic_model.start, states[state_index[tag]],
                                   cn / total_start)

    # End transition
    total_end = sum(tag_ends.values())
    for tag, cn in tag_ends.items():
        basic_model.add_transition(states[state_index[tag]], basic_model.end,
                                   cn / total_end)

    # Edges between states for the observed transition frequencies P(tag_i | tag_i-1)
    for key, value in tag_bigrams.items():
        basic_model.add_transition(states[state_index[key[0]]],
                                   states[state_index[key[1]]],
                                   value / tag_unigrams[key[0]])

    # Finalize the model
    basic_model.bake()

    return basic_model
Beispiel #16
0
def state_sequence_from(emissions, name):
    """Create a State per emission dict, named '<name>_<index>'.

    Prints a creation message for each state, and also returns a list
    of 1s of length len(states) - 1 (per-link transition weights).
    """
    states = []
    for position, emission in enumerate(emissions):
        label = name + '_' + str(position)
        print('creado estado', label)
        states.append(State(DiscreteDistribution(emission), name=label))
    return states, [1] * (len(states) - 1)
Beispiel #17
0
def init_model(start_dip, stay_state, mean_eu, sd_eu, mean_loh):
    """Build a 3-state (EU / LOH / ANEU) Gaussian HMM.

    Emission distributions are frozen after baking so that training
    only adjusts transition probabilities.
    """
    # Emission distributions.
    d_eu = NormalDistribution(mean_eu, sd_eu)  ## euploid enriched at 0
    d_loh = NormalDistribution(mean_loh,
                               sd_eu)  ## loss of heterozygosity enriched at 1
    d_aneu = NormalDistribution(mean_loh / 2.0,
                                sd_eu * 1.4)  ## aneuploid enriched at 1

    # Named states.
    s_eu = State(d_eu, name='EU')  ## enriched at 0
    s_loh = State(d_loh, name='LOH')  ## enriched at 1
    s_aneu = State(d_aneu, name='ANEU')  ## enriched at 1

    model = HiddenMarkovModel()
    model.add_states(s_eu, s_loh, s_aneu)

    # Transition matrix as (source, target, probability) triples.
    for source, target, prob in (
            (model.start, s_eu, start_dip),
            (model.start, s_loh, 1.0 - start_dip - 0.1),
            (model.start, s_aneu, 0.1),
            (s_eu, s_eu, stay_state),
            (s_eu, s_loh, 1.0 - 4 * stay_state / 5 - 0.001),
            (s_eu, s_aneu, 1.0 - stay_state / 5 - 0.001),
            (s_eu, model.end, 0.002),
            (s_loh, s_loh, stay_state),
            (s_loh, s_eu, 1.0 - 4 * stay_state / 5 - 0.001),
            (s_loh, s_aneu, 1.0 - stay_state / 5 - 0.001),
            (s_loh, model.end, 0.002),
            (s_aneu, s_aneu, stay_state),
            (s_aneu, s_eu, 1.0 - stay_state / 2 - 0.001),
            (s_aneu, s_loh, 1.0 - stay_state / 2 - 0.001),
            (s_aneu, model.end, 0.002)):
        model.add_transition(source, target, prob)

    ## finalize internal structure
    model.bake()
    ## only train transitions, not emissions
    model.freeze_distributions()

    return model
Beispiel #18
0
def build_the_same_model_in_test_sample_from_site_line_by_line():
    """Reproduce the 3-state Gaussian HMM from the pomegranate site
    example, bake it, and plot it."""
    # States hold only emission distributions; transition probabilities
    # live on the graph edges.
    s1 = State(NormalDistribution(5, 1))
    s2 = State(NormalDistribution(1, 7))
    s3 = State(NormalDistribution(8, 2))

    model = HiddenMarkovModel()
    model.add_states(s1, s2, s3)
    for edge in ((model.start, s1, 1.0),
                 (s1, s1, 0.7), (s1, s2, 0.3),
                 (s2, s2, 0.8), (s2, s3, 0.2),
                 (s3, s3, 0.9), (s3, model.end, 0.1)):
        model.add_transition(*edge)
    model.bake()

    model.plot()
Beispiel #19
0
def train_hmm_tagger(data):
    """Build, bake, and sanity-check a POS-tagging HMM from data.

    NOTE(review): relies on module-level globals — tag_unigrams,
    tag_starts, tag_ends, tag_bigrams, pair_counts, HTML — that must
    already be defined; confirm they exist in the enclosing notebook or
    module.
    """
    # HMM
    # Use the tag unigrams and bigrams calculated above to construct a hidden Markov tagger.
    #
    # - Add one state per tag
    #     - The emission distribution at each state should be estimated with the formula: $P(w|t) = \frac{C(t, w)}{C(t)}$
    # - Add an edge from the starting state `basic_model.start` to each tag
    #     - The transition probability should be estimated with the formula: $P(t|start) = \frac{C(start, t)}{C(start)}$
    # - Add an edge from each tag to the end state `basic_model.end`
    #     - The transition probability should be estimated with the formula: $P(end|t) = \frac{C(t, end)}{C(t)}$
    # - Add an edge between _every_ pair of tags
    #     - The transition probability should be estimated with the formula: $P(t_2|t_1) = \frac{C(t_1, t_2)}{C(t_1)}$
    basic_model = HiddenMarkovModel(name="base-hmm-tagger")

    state_dict = {}
    states = []
    # pair_counts over (tags, words) — the stream is reversed so tags
    # come first.
    emission_counts = pair_counts(*list(zip(
        *data.training_set.stream()))[::-1])
    for tag in emission_counts.keys():
        tag_count = tag_unigrams[tag]
        # P(w|t) = C(t, w) / C(t)
        probs = {}
        for w in emission_counts[tag]:
            probs[w] = emission_counts[tag][w] / tag_count
        emission_p = DiscreteDistribution(probs)
        state = State(emission_p, name="" + tag)
        basic_model.add_state(state)
        state_dict[tag] = state

    # Start and end transitions for every tag that starts a sequence.
    for tag in tag_starts:
        basic_model.add_transition(basic_model.start, state_dict[tag],
                                   tag_starts[tag] / len(data.training_set.Y))
        basic_model.add_transition(state_dict[tag], basic_model.end,
                                   tag_ends[tag] / tag_unigrams[tag])

    # Tag-to-tag transitions: P(t2|t1) = C(t1, t2) / C(t1)
    for (tag1, tag2) in tag_bigrams:
        basic_model.add_transition(
            state_dict[tag1], state_dict[tag2],
            tag_bigrams[(tag1, tag2)] / tag_unigrams[tag1])

    # finalize the model
    basic_model.bake()

    assert all(
        tag in set(s.name for s in basic_model.states)
        for tag in data.training_set.tagset
    ), "Every state in your network should use the name of the associated tag, which must be one of the training set tags."
    assert basic_model.edge_count() == 168, (
        "Your network should have an edge from the start node to each state, one edge between every "
        +
        "pair of tags (states), and an edge from each state to the end node.")
    HTML(
        '<div class="alert alert-block alert-success">Your HMM network topology looks good!</div>'
    )
    return basic_model
Beispiel #20
0
def buildHmm(minAmpliconLength, maxGap, windowSize):
    """Construct a two-state (background / amplicon) HMM over binary
    observations.

    The state-switching probability and the amplicon emission
    probabilities are derived from the minimum amplicon length, the
    maximum allowed gap, and the window size.
    """
    b_bkgd_1 = 0.1
    a_interstate = b_bkgd_1 ** (2 * minAmpliconLength / windowSize)
    b_amp_0 = a_interstate ** (0.5 * windowSize / maxGap)

    background = State(DiscreteDistribution({0: 1 - b_bkgd_1, 1: b_bkgd_1}),
                       name='background')
    amplicon = State(DiscreteDistribution({0: b_amp_0, 1: 1 - b_amp_0}),
                     name='amplicon')

    hmm = HiddenMarkovModel()
    hmm.add_states(background, amplicon)
    # Symmetric switching: either state flips with prob a_interstate.
    stay = 1 - a_interstate
    for source, target, prob in ((hmm.start, background, stay),
                                 (hmm.start, amplicon, a_interstate),
                                 (background, background, stay),
                                 (background, amplicon, a_interstate),
                                 (amplicon, background, a_interstate),
                                 (amplicon, amplicon, stay)):
        hmm.add_transition(source, target, prob)
    hmm.bake()
    return hmm
Beispiel #21
0
def build_net(cpts):
    """Assemble the 'Poker Game' Bayesian network.

    cpts maps node name -> conditional probability table; edges are
    taken from the module-level `sheets` (name, parents, _) triples.
    """
    states = {name: State(cpt, name=name) for name, cpt in cpts.items()}

    model = BayesianNetwork('Poker Game')
    model.add_states(*list(states.values()))

    for name, parents, _ in sheets:
        for parent in parents:
            print(states[parent])
            model.add_transition(states[parent], states[name])

    model.bake()
    return model
Beispiel #22
0
def create_hidden_MarkovModel(e_df, q_df, start_p_dict):
    """
    Create a Hidden Markov Model from DataFrames.

    @args:
        - e_df (pd.DataFrame): emission probabilities, one column per
          state, P(observation | state)
        - q_df (pd.DataFrame): transition probabilities between states
        - start_p_dict (dict): start probability for each state
    @returns:
        HiddenMarkovModel: the baked model.
    """
    model = HiddenMarkovModel(name="Example Model")

    # 1: Create a dict entry for each state named in the transition df.
    model_dict = {}
    for key in q_df.keys().values:
        model_dict[key] = {}

    # 2: Create the states.
    for key in model_dict:
        # 2.1: Attach the emission probabilities, P(observation | state).
        # (The original code built a second, identical State here and
        # threw it away; that dead allocation is removed.)
        emission_p = DiscreteDistribution(e_df[key].to_dict())
        model_dict[key] = State(emission_p, name=key)
        model.add_state(model_dict[key])
        # 2.2: Add the start probability for each state.
        model.add_transition(model.start, model_dict[key], start_p_dict[key])

    # 3: Add the transition probability between each pair of states.
    for key, item in q_df.to_dict("index").items():
        for item_name, value in item.items():
            print(key, " , ", item_name, ": ", value)
            tmp_origin = model_dict[key]
            tmp_destination = model_dict[item_name]
            model.add_transition(tmp_origin, tmp_destination,
                                 q_df.loc[key, item_name])
    # finally, call the .bake() method to finalize the model
    model.bake()

    return model
Beispiel #23
0
def ghmm_model(states_labels: tuple,
               transitions: tuple,
               init_prob: tuple,
               end_prob: tuple,
               means: list,
               vars: list) -> HiddenMarkovModel:
    """Build and bake a Gaussian-mixture HMM.

    :param states_labels: one label per hidden state
    :param transitions: square matrix of state-to-state probabilities
    :param init_prob: start probability per state
    :param end_prob: end probability per state
    :param means: per-state list of mixture-component means
    :param vars: per-state list of mixture-component variances
    :return: the baked HiddenMarkovModel
    """
    hmm_model = HiddenMarkovModel()

    n_components = len(vars[0])
    # One GMM-emitting state per label, named by its index.
    states = []
    for idx in range(len(states_labels)):
        components = [NormalDistribution(means[idx][c], vars[idx][c])
                      for c in range(n_components)]
        states.append(State(GeneralMixtureModel(components), name=str(idx)))
    hmm_model.add_states(*tuple(states))

    # Zero-probability edges are simply omitted.
    n_states = len(states_labels)
    for src in range(n_states):
        for dst in range(n_states):
            prob = transitions[src][dst]
            if prob != 0.:
                hmm_model.add_transition(states[src], states[dst], prob)
    for idx, prob in enumerate(init_prob):
        if prob != 0.:
            hmm_model.add_transition(hmm_model.start, states[idx], prob)
    for idx, prob in enumerate(end_prob):
        if prob != 0.:
            hmm_model.add_transition(states[idx], hmm_model.end, prob)

    hmm_model.bake()

    return hmm_model
def emission_state_list(training_word_prob_path, training_all_word_path):
    """Build one State per POS tag with a DiscreteDistribution over
    words, P(word | tag), from a word/tag count CSV.

    If the probability CSV does not exist yet, it is generated from the
    raw counts via word_by_tag().

    @args:
        - training_word_prob_path (str): CSV of per-tag word counts
        - training_all_word_path (str): raw word/tag source for
          word_by_tag()
    @returns:
        dict: tag -> State wrapping that tag's word distribution.
    """
    if not os.path.exists(training_word_prob_path):
        word_by_tag(training_all_word_path, training_word_prob_path)

    df = pd.read_csv(training_word_prob_path)
    df.drop(columns=['Unnamed: 0', 'SUM'], inplace=True)
    distinct_types = [col for col in df.columns if col not in ['Word', 'SUM']]

    unigram_word_hash = {}
    """
    Split df based on distinct tags
    """
    # .copy() gives each tag an independent frame; the original code
    # mutated a slice view of df (pandas SettingWithCopy hazard).
    for d in distinct_types:
        sample_df = df[df[d] > 0]
        unigram_word_hash[d] = sample_df[['Word', d]].copy()
    """
    P W|T V word in tag
    """
    # Normalize counts into P(word | tag), in place.
    for k in unigram_word_hash:
        subset = unigram_word_hash[k]
        subset[k] = subset[k] / subset[k].sum()
        unigram_word_hash[k] = subset
    """
    Assert probability distribution == 1
    """
    for k in unigram_word_hash:
        subset = unigram_word_hash[k]
        print(f'{k} distribution sum => {subset[k].sum()}')
    """
    Create discrete distribution objects
    """
    for k in unigram_word_hash:
        dist = unigram_word_hash[k]
        print(k)
        # 'records' must be lowercase: modern pandas rejects 'Records'.
        dist_dict = dist.set_index("Word").T.to_dict("records")[0]
        if k == 'NOUN':
            print(dist_dict['time'])
        discrete_dist = DiscreteDistribution(dist_dict)
        unigram_word_hash[k] = State(discrete_dist, name=k)

    return unigram_word_hash
Beispiel #25
0
# Calculate the count of each tag starting a sequence
tag_starts = starting_counts(data.training_set.Y)
# Calculate the count of each tag ending a sequence
tag_ends = ending_counts(data.training_set.Y)

basic_model = HiddenMarkovModel(name="base-hmm-tagger")

# Create states with emission probability distributions P(word | tag) and add to the model
tag_states = {}

for tag in data.training_set.tagset:
    # P(word | tag) = C(tag, word) / C(tag)
    tag_emissions = DiscreteDistribution({
        word: emission_counts[tag][word] / tag_unigrams[tag]
        for word in emission_counts[tag]
    })
    tag_states[tag] = State(tag_emissions, name=tag)
    basic_model.add_state(tag_states[tag])

# Add edges between states for the observed transition frequencies P(tag_i | tag_i-1)
for tag in data.training_set.tagset:
    # NOTE(review): start transitions are normalized by tag_unigrams[tag]
    # rather than by the total number of sequences — confirm intended.
    basic_model.add_transition(basic_model.start, tag_states[tag],
                               tag_starts[tag] / tag_unigrams[tag])
    for tag1 in data.training_set.tagset:
        # NOTE(review): assumes tag_bigrams has an entry for every tag
        # pair; a missing pair would raise KeyError.
        basic_model.add_transition(
            tag_states[tag], tag_states[tag1],
            tag_bigrams[(tag, tag1)] / tag_unigrams[tag])
    basic_model.add_transition(tag_states[tag], basic_model.end,
                               tag_ends[tag] / tag_unigrams[tag])

# finalize the model
basic_model.bake()
Beispiel #26
0
def build_reference_repeat_finder_hmm(patterns, copies=1):
    """Build a profile HMM that models `copies` tandem repeats of a unit.

    The topology follows the classic profile-HMM layout: for each repeat
    unit there is one match (M) and one delete (D) state per pattern
    position, plus len(pattern)+1 insert (I) states, framed by silent
    unit_start/unit_end states.  Flanking start_random_matches /
    end_random_matches states absorb arbitrary sequence before and after
    the repeat region.

    Args:
        patterns: list of repeat-unit strings; patterns[0] fixes the
            topology and the match-state emission peaks.  If more than one
            pattern is supplied, the model is additionally fit (Viterbi
            training) on each pattern tiled `copies` times.
        copies: number of repeat units chained end-to-start.

    Returns:
        The baked pomegranate model.
    """
    pattern = patterns[0]
    model = Model(name="HMM Model")
    # Insert states emit any nucleotide uniformly.
    insert_distribution = DiscreteDistribution({
        'A': 0.25,
        'C': 0.25,
        'G': 0.25,
        'T': 0.25
    })

    last_end = None  # unit_end of the previous repeat; chains units together
    start_random_matches = State(insert_distribution,
                                 name='start_random_matches')
    end_random_matches = State(insert_distribution, name='end_random_matches')
    model.add_states([start_random_matches, end_random_matches])
    for repeat in range(copies):
        insert_states = []
        match_states = []
        delete_states = []
        # I0..I_len: one insert state per gap position (len(pattern)+1 total).
        for i in range(len(pattern) + 1):
            insert_states.append(
                State(insert_distribution, name='I%s_%s' % (i, repeat)))

        # M1..M_len: each match state strongly favors the pattern base
        # (0.97) with a 0.01 floor on the other three nucleotides.
        for i in range(len(pattern)):
            distribution_map = dict({
                'A': 0.01,
                'C': 0.01,
                'G': 0.01,
                'T': 0.01
            })
            distribution_map[pattern[i]] = 0.97
            match_states.append(
                State(DiscreteDistribution(distribution_map),
                      name='M%s_%s' % (str(i + 1), repeat)))

        # D1..D_len: silent (no emission) delete states.
        for i in range(len(pattern)):
            delete_states.append(
                State(None, name='D%s_%s' % (str(i + 1), repeat)))

        unit_start = State(None, name='unit_start_%s' % repeat)
        unit_end = State(None, name='unit_end_%s' % repeat)
        model.add_states(insert_states + match_states + delete_states +
                         [unit_start, unit_end])
        last = len(delete_states) - 1  # index of the final M/D column

        if repeat > 0:
            # Chain this unit after the previous one.
            model.add_transition(last_end, unit_start, 0.5)
        else:
            # First unit: entered directly or after a random prefix.
            model.add_transition(model.start, unit_start, 0.5)
            model.add_transition(model.start, start_random_matches, 0.5)
            model.add_transition(start_random_matches, unit_start, 0.5)
            model.add_transition(start_random_matches, start_random_matches,
                                 0.5)

        model.add_transition(unit_end, end_random_matches, 0.5)
        if repeat == copies - 1:
            # Last unit: may end the model directly or via a random suffix.
            model.add_transition(unit_end, model.end, 0.5)
            model.add_transition(end_random_matches, end_random_matches, 0.5)
            model.add_transition(end_random_matches, model.end, 0.5)

        # Entry into the unit: overwhelmingly a match at position 1.
        model.add_transition(unit_start, match_states[0], 0.98)
        model.add_transition(unit_start, delete_states[0], 0.01)
        model.add_transition(unit_start, insert_states[0], 0.01)

        model.add_transition(insert_states[0], insert_states[0], 0.01)
        model.add_transition(insert_states[0], delete_states[0], 0.01)
        model.add_transition(insert_states[0], match_states[0], 0.98)

        # Exits from the final column into unit_end.
        model.add_transition(delete_states[last], unit_end, 0.99)
        model.add_transition(delete_states[last], insert_states[last + 1],
                             0.01)

        model.add_transition(match_states[last], unit_end, 0.99)
        model.add_transition(match_states[last], insert_states[last + 1], 0.01)

        # NOTE(review): this I_{last+1} self-loop is added a second time in
        # the loop below when i == last (insert_states[i + 1]); confirm the
        # duplicated edge is intended / harmless after bake().
        model.add_transition(insert_states[last + 1], insert_states[last + 1],
                             0.01)
        model.add_transition(insert_states[last + 1], unit_end, 0.99)

        # Interior column-to-column transitions (M/D/I ladders).
        for i in range(0, len(pattern)):
            model.add_transition(match_states[i], insert_states[i + 1], 0.01)
            model.add_transition(delete_states[i], insert_states[i + 1], 0.01)
            model.add_transition(insert_states[i + 1], insert_states[i + 1],
                                 0.01)
            if i < len(pattern) - 1:
                model.add_transition(insert_states[i + 1], match_states[i + 1],
                                     0.98)
                model.add_transition(insert_states[i + 1],
                                     delete_states[i + 1], 0.01)

                model.add_transition(match_states[i], match_states[i + 1],
                                     0.98)
                model.add_transition(match_states[i], delete_states[i + 1],
                                     0.01)

                model.add_transition(delete_states[i], delete_states[i + 1],
                                     0.01)
                model.add_transition(delete_states[i], match_states[i + 1],
                                     0.98)

        last_end = unit_end

    model.bake()
    if len(patterns) > 1:
        # model.fit(patterns, algorithm='baum-welch', transition_pseudocount=1, use_pseudocount=True)
        # Train on every pattern tiled `copies` times so the learned
        # transitions reflect the expected repeat count.
        fit_patterns = [pattern * copies for pattern in patterns]
        model.fit(fit_patterns,
                  algorithm='viterbi',
                  transition_pseudocount=1,
                  use_pseudocount=True)

    return model
Beispiel #27
0
def get_constant_number_of_repeats_matcher_hmm(patterns, copies):
    """Build an HMM that matches exactly `copies` repeats of a pattern profile.

    Unlike build_reference_repeat_finder_hmm, the emission and transition
    probabilities are not hard-coded: they come from a profile HMM built on
    `patterns` by build_profile_hmm_for_repeats, and the profile columns are
    unrolled `copies` times with deterministic (probability 1) start/end
    links and no flanking random-match states.

    Args:
        patterns: patterns handed to build_profile_hmm_for_repeats.
        copies: exact number of repeat units to chain.

    Returns:
        The model baked with merge=None (silent states kept as-is).
    """
    model = Model(name="Repeating Pattern Matcher HMM Model")

    # Learned per-column transition/emission tables; the emission keys that
    # start with 'M' define the profile's match columns.
    transitions, emissions = build_profile_hmm_for_repeats(
        patterns, settings.MAX_ERROR_RATE)
    matches = [m for m in emissions.keys() if m.startswith('M')]

    last_end = None  # unit_end of the previous repeat; chains units together
    for repeat in range(copies):
        insert_states = []
        match_states = []
        delete_states = []
        # I0..I_len insert states, each with its learned emission table.
        for i in range(len(matches) + 1):
            insert_distribution = DiscreteDistribution(emissions['I%s' % i])
            insert_states.append(
                State(insert_distribution, name='I%s_%s' % (i, repeat)))

        # M1..M_len match states with learned emissions.
        for i in range(1, len(matches) + 1):
            match_distribution = DiscreteDistribution(emissions['M%s' % i])
            match_states.append(
                State(match_distribution, name='M%s_%s' % (str(i), repeat)))

        # D1..D_len silent delete states.
        for i in range(1, len(matches) + 1):
            delete_states.append(State(None, name='D%s_%s' % (str(i), repeat)))

        unit_start = State(None, name='unit_start_%s' % repeat)
        unit_end = State(None, name='unit_end_%s' % repeat)
        model.add_states(insert_states + match_states + delete_states +
                         [unit_start, unit_end])
        n = len(delete_states) - 1  # index of the final profile column

        # Units are chained deterministically: exactly `copies` repeats.
        if repeat > 0:
            model.add_transition(last_end, unit_start, 1)
        else:
            model.add_transition(model.start, unit_start, 1)

        if repeat == copies - 1:
            model.add_transition(unit_end, model.end, 1)

        # Entry transitions from unit_start, taken from the learned profile.
        model.add_transition(unit_start, match_states[0],
                             transitions['unit_start']['M1'])
        model.add_transition(unit_start, delete_states[0],
                             transitions['unit_start']['D1'])
        model.add_transition(unit_start, insert_states[0],
                             transitions['unit_start']['I0'])

        model.add_transition(insert_states[0], insert_states[0],
                             transitions['I0']['I0'])
        model.add_transition(insert_states[0], delete_states[0],
                             transitions['I0']['D1'])
        model.add_transition(insert_states[0], match_states[0],
                             transitions['I0']['M1'])

        # Exits from the final column into unit_end.
        model.add_transition(delete_states[n], unit_end,
                             transitions['D%s' % (n + 1)]['unit_end'])
        model.add_transition(delete_states[n], insert_states[n + 1],
                             transitions['D%s' % (n + 1)]['I%s' % (n + 1)])

        model.add_transition(match_states[n], unit_end,
                             transitions['M%s' % (n + 1)]['unit_end'])
        model.add_transition(match_states[n], insert_states[n + 1],
                             transitions['M%s' % (n + 1)]['I%s' % (n + 1)])

        # NOTE(review): this I_{n+1} self-loop is added a second time in the
        # loop below when i == n + 1; confirm the duplicated edge is
        # intended / harmless after bake().
        model.add_transition(insert_states[n + 1], insert_states[n + 1],
                             transitions['I%s' % (n + 1)]['I%s' % (n + 1)])
        model.add_transition(insert_states[n + 1], unit_end,
                             transitions['I%s' % (n + 1)]['unit_end'])

        # Interior column-to-column transitions (M/D/I ladders).
        for i in range(1, len(matches) + 1):
            model.add_transition(match_states[i - 1], insert_states[i],
                                 transitions['M%s' % i]['I%s' % i])
            model.add_transition(delete_states[i - 1], insert_states[i],
                                 transitions['D%s' % i]['I%s' % i])
            model.add_transition(insert_states[i], insert_states[i],
                                 transitions['I%s' % i]['I%s' % i])
            if i < len(matches):
                model.add_transition(insert_states[i], match_states[i],
                                     transitions['I%s' % i]['M%s' % (i + 1)])
                model.add_transition(insert_states[i], delete_states[i],
                                     transitions['I%s' % i]['D%s' % (i + 1)])

                model.add_transition(match_states[i - 1], match_states[i],
                                     transitions['M%s' % i]['M%s' % (i + 1)])
                model.add_transition(match_states[i - 1], delete_states[i],
                                     transitions['M%s' % i]['D%s' % (i + 1)])

                model.add_transition(delete_states[i - 1], match_states[i],
                                     transitions['D%s' % i]['M%s' % (i + 1)])
                model.add_transition(delete_states[i - 1], delete_states[i],
                                     transitions['D%s' % i]['D%s' % (i + 1)])

        last_end = unit_end

    # merge=None keeps all silent states instead of collapsing them.
    model.bake(merge=None)
    return model
Beispiel #28
0
def get_suffix_matcher_hmm(pattern):
    """Build a profile HMM that matches any suffix of `pattern`.

    Same M/D/I column layout as the repeat-finder HMMs, but with one key
    difference: unit_start fans out to EVERY match state with equal
    probability (0.98 / len(pattern)), so a traversal may begin at any
    position of the pattern and thereby match only its tail.

    Args:
        pattern: the unit string whose suffixes should be matched.

    Returns:
        The model baked with merge=None (silent states kept as-is).
    """
    model = Model(name="Suffix Matcher HMM Model")
    # Insert states emit any nucleotide uniformly.
    insert_distribution = DiscreteDistribution({
        'A': 0.25,
        'C': 0.25,
        'G': 0.25,
        'T': 0.25
    })
    insert_states = []
    match_states = []
    delete_states = []
    hmm_name = 'suffix'
    # I0..I_len: one insert state per gap position (len(pattern)+1 total).
    for i in range(len(pattern) + 1):
        insert_states.append(
            State(insert_distribution, name='I%s_%s' % (i, hmm_name)))

    # M1..M_len: each match state strongly favors the pattern base (0.97)
    # with a 0.01 floor on the other three nucleotides.
    for i in range(len(pattern)):
        distribution_map = dict({'A': 0.01, 'C': 0.01, 'G': 0.01, 'T': 0.01})
        distribution_map[pattern[i]] = 0.97
        match_states.append(
            State(DiscreteDistribution(distribution_map),
                  name='M%s_%s' % (str(i + 1), hmm_name)))

    # D1..D_len: silent (no emission) delete states.
    for i in range(len(pattern)):
        delete_states.append(
            State(None, name='D%s_%s' % (str(i + 1), hmm_name)))

    unit_start = State(None, name='suffix_start_%s' % hmm_name)
    unit_end = State(None, name='suffix_end_%s' % hmm_name)
    model.add_states(insert_states + match_states + delete_states +
                     [unit_start, unit_end])
    last = len(delete_states) - 1  # index of the final M/D column

    model.add_transition(model.start, unit_start, 1)

    model.add_transition(unit_end, model.end, 1)

    model.add_transition(unit_start, delete_states[0], 0.01)
    model.add_transition(unit_start, insert_states[0], 0.01)
    # Suffix property: equal-probability entry at every match position.
    for i in range(len(pattern)):
        model.add_transition(unit_start, match_states[i], 0.98 / len(pattern))

    model.add_transition(insert_states[0], insert_states[0], 0.01)
    model.add_transition(insert_states[0], delete_states[0], 0.01)
    model.add_transition(insert_states[0], match_states[0], 0.98)

    # Exits from the final column into unit_end.
    model.add_transition(delete_states[last], unit_end, 0.99)
    model.add_transition(delete_states[last], insert_states[last + 1], 0.01)

    model.add_transition(match_states[last], unit_end, 0.99)
    model.add_transition(match_states[last], insert_states[last + 1], 0.01)

    # NOTE(review): this I_{last+1} self-loop is added a second time in the
    # loop below when i == last; confirm the duplicated edge is intended /
    # harmless after bake().
    model.add_transition(insert_states[last + 1], insert_states[last + 1],
                         0.01)
    model.add_transition(insert_states[last + 1], unit_end, 0.99)

    # Interior column-to-column transitions (M/D/I ladders).
    for i in range(0, len(pattern)):
        model.add_transition(match_states[i], insert_states[i + 1], 0.01)
        model.add_transition(delete_states[i], insert_states[i + 1], 0.01)
        model.add_transition(insert_states[i + 1], insert_states[i + 1], 0.01)
        if i < len(pattern) - 1:
            model.add_transition(insert_states[i + 1], match_states[i + 1],
                                 0.98)
            model.add_transition(insert_states[i + 1], delete_states[i + 1],
                                 0.01)

            model.add_transition(match_states[i], match_states[i + 1], 0.98)
            model.add_transition(match_states[i], delete_states[i + 1], 0.01)

            model.add_transition(delete_states[i], delete_states[i + 1], 0.01)
            model.add_transition(delete_states[i], match_states[i + 1], 0.98)

    # merge=None keeps all silent states instead of collapsing them.
    model.bake(merge=None)

    return model
Beispiel #29
0
def setup_huge_monty():
    """Build the six-node 'huge' Monty Hall Bayesian network and publish it,
    together with all of its component distributions, as module globals.

    The network deliberately mixes value types (booleans, door letters,
    integers) to exercise heterogeneous data handling:
    friend -> guest -> monty <- prize <- randomize <- remaining,
    with friend also feeding prize.
    """
    # Build the huge monty hall huge_monty_network. This is an example I made
    # up with which may not exactly flow logically, but tests a varied type of
    # tables ensures heterogeneous types of data work together.
    global huge_monty_network, huge_monty_friend, huge_monty_guest, huge_monty
    global huge_monty_remaining, huge_monty_randomize, huge_monty_prize

    # Whether the guest brings a friend: fair-coin prior.
    huge_monty_friend = DiscreteDistribution({True: 0.5, False: 0.5})

    # The guest's door choice, conditioned on whether they have a friend.
    huge_monty_guest = ConditionalProbabilityTable(
        [[True, 'A', 0.50],
         [True, 'B', 0.25],
         [True, 'C', 0.25],
         [False, 'A', 0.0],
         [False, 'B', 0.7],
         [False, 'C', 0.3]], [huge_monty_friend])

    # Number of huge_monty_remaining cars (integer-valued node).
    huge_monty_remaining = DiscreteDistribution({0: 0.1, 1: 0.7, 2: 0.2, })

    # Whether they huge_monty_randomize is dependent on the number of
    # huge_monty_remaining cars.
    huge_monty_randomize = ConditionalProbabilityTable(
        [[0, True, 0.05],
         [0, False, 0.95],
         [1, True, 0.8],
         [1, False, 0.2],
         [2, True, 0.5],
         [2, False, 0.5]], [huge_monty_remaining])

    # Where the huge_monty_prize is depends on if they huge_monty_randomize or
    # not and also the huge_monty_guests huge_monty_friend.
    huge_monty_prize = ConditionalProbabilityTable(
        [[True, True, 'A', 0.3],
         [True, True, 'B', 0.4],
         [True, True, 'C', 0.3],
         [True, False, 'A', 0.2],
         [True, False, 'B', 0.4],
         [True, False, 'C', 0.4],
         [False, True, 'A', 0.1],
         [False, True, 'B', 0.9],
         [False, True, 'C', 0.0],
         [False, False, 'A', 0.0],
         [False, False, 'B', 0.4],
         [False, False, 'C', 0.6]], [huge_monty_randomize, huge_monty_friend])

    # Monty is dependent on both the huge_monty_guest and the huge_monty_prize:
    # he never opens the guest's door or the prize door.
    huge_monty = ConditionalProbabilityTable(
        [['A', 'A', 'A', 0.0],
         ['A', 'A', 'B', 0.5],
         ['A', 'A', 'C', 0.5],
         ['A', 'B', 'A', 0.0],
         ['A', 'B', 'B', 0.0],
         ['A', 'B', 'C', 1.0],
         ['A', 'C', 'A', 0.0],
         ['A', 'C', 'B', 1.0],
         ['A', 'C', 'C', 0.0],
         ['B', 'A', 'A', 0.0],
         ['B', 'A', 'B', 0.0],
         ['B', 'A', 'C', 1.0],
         ['B', 'B', 'A', 0.5],
         ['B', 'B', 'B', 0.0],
         ['B', 'B', 'C', 0.5],
         ['B', 'C', 'A', 1.0],
         ['B', 'C', 'B', 0.0],
         ['B', 'C', 'C', 0.0],
         ['C', 'A', 'A', 0.0],
         ['C', 'A', 'B', 1.0],
         ['C', 'A', 'C', 0.0],
         ['C', 'B', 'A', 1.0],
         ['C', 'B', 'B', 0.0],
         ['C', 'B', 'C', 0.0],
         ['C', 'C', 'A', 0.5],
         ['C', 'C', 'B', 0.5],
         ['C', 'C', 'C', 0.0]], [huge_monty_guest, huge_monty_prize])

    # Wrap each distribution in a network node (state).
    s0 = State(huge_monty_friend, name="huge_monty_friend")
    s1 = State(huge_monty_guest, name="huge_monty_guest")
    s2 = State(huge_monty_prize, name="huge_monty_prize")
    s3 = State(huge_monty, name="huge_monty")
    s4 = State(huge_monty_remaining, name="huge_monty_remaining")
    s5 = State(huge_monty_randomize, name="huge_monty_randomize")

    # Make the bayes net, add the states, and the conditional dependencies.
    huge_monty_network = BayesianNetwork("test")
    huge_monty_network.add_nodes(s0, s1, s2, s3, s4, s5)
    huge_monty_network.add_transition(s0, s1)
    huge_monty_network.add_transition(s1, s3)
    huge_monty_network.add_transition(s2, s3)
    huge_monty_network.add_transition(s4, s5)
    huge_monty_network.add_transition(s5, s2)
    huge_monty_network.add_transition(s0, s2)
    huge_monty_network.bake()
Beispiel #30
0
def spacer_states_maker(quantity, distribution, name):
    """Create `quantity` spacer states sharing the same emission table.

    Each state wraps its own DiscreteDistribution built from `distribution`
    and is named by appending its index to `name` ("<name>0", "<name>1", ...).

    Returns:
        A list of the newly created State objects.
    """
    return [
        State(DiscreteDistribution(distribution), name=name + str(idx))
        for idx in range(quantity)
    ]