import numpy

from matrix_from_aln import matrix_from_exa
from model_maker_utils import equal_distribution

# Example fragment: HMM modelling an intron followed by an acceptor splice
# site ('intron_acceptor').
# NOTE(review): numpy, classify, HiddenMarkovModel, State,
# DiscreteDistribution, calculator, sequence_state_factory, add_sequence and
# converter_to are defined/imported elsewhere -- presumably a
# pomegranate-style HMM API; confirm against the rest of the project.

# Acceptor-site position matrix loaded from an alignment export (.exa),
# then discretised (meaning of the second argument 2 is defined by classify()).
matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

# Self-looping intron state; emissions estimated from 'cuts_intron.txt'.
intron = State(DiscreteDistribution(
    calculator.intron_calculator('cuts_intron.txt').p),
               name='in')
# One state per acceptor-motif position, chained in order by add_sequence().
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
# Uniform-emission tail state after the acceptor motif.
post = State(DiscreteDistribution(equal_distribution), name='post')

model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

# Topology: start -> intron (self-loop 0.9) -> acceptor chain -> post -> end.
model.add_transition(model.start, intron, 1)
model.add_transition(intron, intron, 0.9)
model.add_transition(intron, acceptor0_states[0], 0.1)
model.add_transition(acceptor0_states[-1], post, 1)
model.add_transition(post, post, 0.5)
model.add_transition(post, model.end, 0.5)

model.bake()
# Raw test sequence; lower-cased and stripped of spaces and 'p' markers
# before conversion to the model's symbol alphabet.
test_l = 'GTAACACTGAATACTCAGGAACAATTAATGGATGGTAACATATGAGGAATATCTAGGAGGCACACCCTCTCTGGCATCTATGATGGGCCAAAAACCCGCATTCGCTTGGCCACAGTATGTGAAATATAACCCAGCTTAGACACAGGGTGCGGCAGCTGTCATGTTTCTCTGTGTGTGCCGAGTGTCATGTCTGCACCGTACAGGGATAGCTGAGTCTTCATCCTCCTCAGCTCCTATCTGTCCAGTGCAATGAACAGCAGCTGCTCTCTTCCTCTCTGGTTCCCATGGCAGCCATGCTCTGTTGCAGAGAGAACAGGATTGCATGTTCCCTCTTAATGGGAACGTCCATTTTGCTTTCTGGGACCACTCTCTTAATGCCGCCTGTCAAAACCAGCTAGGACTCCCTGGGGTCCAATCCCTCTGTGTTTAATCTTCTGTCATCTCTGTCCCACCTGGCTCATCAGGGAGATGCAGAAGGCTGAAGAAAAGGAAGTCCCTGAGGACTCACTGGAGGAATGTGCCATCACTTGTTCAAATAGCCATGGCCCTTATGACTCCAACCATGACTCCAACC'
converted = converter_to(test_l.lower().replace(' ', '').replace('p', ''))

# Viterbi decoding left disabled:
#logp, path = model.viterbi(converted)
#print(logp, [x[1].name + str(i) for i, x in enumerate(path)])
def train_and_test():
    """Build a 'coding to donor' HMM, evaluate it, train it, and save it.

    Pipeline: read exon training sequences, assemble a three-phase coding
    model feeding into a donor-site state chain, test the untrained model,
    fit it with pseudocounts (Baum-Welch), re-test, and dump the result
    as JSON.
    """
    # Read raw training lines, dropping 'P' markers and trailing newlines.
    with open('../data extractors/exons_start_1.txt') as in_file:
        total = [raw.replace('P', '').lower().replace('\n', '')
                 for raw in in_file]

    # Encode every cleaned sequence into the model's symbol alphabet.
    converted_total = [converter_to(seq, 2) for seq in total]

    donor_matrix = numpy.array(
        matrix_from_exa('../data extractors/new_donor1.exa'))

    # Per-phase emission tables for the three codon positions.
    c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
    print(c0.p, c1.p, c2.p)
    coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
    coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
    coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

    donor_profile = classify(donor_matrix, 2)
    donor0_states = sequence_state_factory(donor_profile, 'donor0')

    # Uniform-emission tail state reached after the donor motif.
    post = State(DiscreteDistribution(equal_distribution), name='post')

    model = HiddenMarkovModel('coding to donor')

    for phase_state in (coding_state0, coding_state1, coding_state2):
        model.add_state(phase_state)

    add_sequence(model, donor0_states)

    model.add_state(post)

    model.add_transition(model.start, coding_state0, 1)

    # Each coding phase either advances to the next phase (0.6) or exits
    # into the donor-site chain (0.4).
    for src, nxt in ((coding_state0, coding_state1),
                     (coding_state1, coding_state2),
                     (coding_state2, coding_state0)):
        model.add_transition(src, nxt, 0.6)
        model.add_transition(src, donor0_states[0], 0.4)

    model.add_transition(donor0_states[-1], post, 1)

    model.add_transition(post, post, 0.9)
    model.add_transition(post, model.end, 0.1)

    model.bake()
    test_model(model)  # baseline accuracy before training

    model.fit(converted_total,
              transition_pseudocount=1,
              emission_pseudocount=1,
              verbose=True)

    test_model(model)  # accuracy after training

    with open('partial_model_coding_to_donor_model0.json', 'w') as out:
        out.write(model.to_json())
# ===== Esempio n. 3 (Example 3) =====
# (scraper separator; stray "0" marker line preserved as a comment)
# Truncated example fragment: only the state-registration portion of a larger
# coding model is visible.  c0/c1/c2, coding_model, back, fake_back, in0-in2,
# exon3_state and all *_states / *_spacers collections are defined in code
# not shown here.

# Three codon-phase coding states sharing per-phase emission tables c0..c2.
coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

coding_model.add_state(back)
coding_model.add_state(fake_back)
coding_model.add_state(coding_state0)
coding_model.add_state(coding_state1)
coding_model.add_state(coding_state2)

# in0..in2 -- presumably one intron state per codon phase; confirm upstream.
coding_model.add_state(in0)
coding_model.add_state(in1)
coding_model.add_state(in2)

coding_model.add_state(exon3_state)
# Poly-A signal chain and the spacer that follows it.
add_sequence(coding_model, poly_a_states)
add_sequence(coding_model, post_poly_spacer)

# Per-phase intron spacer chains.
add_sequence(coding_model, in0_spacers)
add_sequence(coding_model, in1_spacers)
add_sequence(coding_model, in2_spacers)

add_sequence(coding_model, ze_states)

# Exon-end chains, presumably one per stop codon (TAA / TGA / TAG).
add_sequence(coding_model, ez_states_taa)
add_sequence(coding_model, ez_states_tga)
add_sequence(coding_model, ez_states_tag)

# Donor-site chains for the three phases.
add_sequence(coding_model, donor0_states)
add_sequence(coding_model, donor1_states)
add_sequence(coding_model, donor2_states)
# ===== Esempio n. 4 (Example 4) =====
# (scraper separator; stray "0" marker line preserved as a comment)
# Example fragment: 'coding_to_stop' HMM.  Three codon-phase coding states
# advance deterministically until phase 2 branches into the stop-codon
# motif chain, which drains into a uniform 'post' state.
# NOTE(review): c0/c1/c2 and matrixStop come from code not shown here.
coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

# Uniform-emission tail state after the stop motif.
post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

# Stop-codon motif states from the (externally defined) stop matrix.
stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)

add_sequence(model, stop_states)

model.add_state(post)

# NOTE(review): entry is coding_state1 rather than coding_state0, unlike
# the sibling coding-to-donor models -- confirm the phase offset is
# intentional.
model.add_transition(model.start, coding_state1, 1)
model.add_transition(coding_state0, coding_state1, 1)
model.add_transition(coding_state1, coding_state2, 1)
model.add_transition(coding_state2, coding_state0, 0.6)
model.add_transition(coding_state2, stop_states[0], 0.4)
model.add_transition(stop_states[-1], post, 1)
model.add_transition(post, post, 0.9)
model.add_transition(post, model.end, 0.1)

model.bake()

with open('../data extractors/exons_end_start_2.txt') as in_file:
# ===== Esempio n. 5 (Example 5) =====
# (scraper separator; stray "0" marker line preserved as a comment)
                not_ok += 1
        print(oks / (oks + not_ok))


# Example fragment: background -> start-zone motif -> background HMM used to
# locate start sites.  Both background states emit a uniform distribution.
back = State(DiscreteDistribution(equal_distribution), name='back')
back2 = State(DiscreteDistribution(equal_distribution), name='back2')

# Position-specific start-zone motif states from the 'starts.exa' alignment.
matrixZE = numpy.array(matrix_from_exa('../data extractors/starts.exa'))
start_states_data = classify(matrixZE, 2)
start_states = sequence_state_factory(start_states_data, 'start zone')

model = HiddenMarkovModel()

model.add_state(back)
model.add_state(back2)
add_sequence(model, start_states)

# Topology: start -> back (self-loop 0.55) -> motif chain -> back2.
model.add_transition(model.start, back, 1)
model.add_transition(back, back, 0.55)
model.add_transition(back, start_states[0], 0.45)
model.add_transition(start_states[-1], back2, 1)
# NOTE(review): back2 only self-loops (0.5); no transition to model.end is
# declared before bake() -- confirm the HMM library tolerates or
# renormalises this.
model.add_transition(back2, back2, 0.5)

model.bake()


def train_and_test():
    test(model)

    lines = []
    with open('../data extractors/train_start2.exa') as fi:
# ===== Esempio n. 6 (Example 6) =====
# (scraper separator; stray "0" marker line preserved as a comment)
# Truncated example fragment: state registration for a promoter / 5'-UTR
# model.  tata_data, no_coding, inr_data, no_inr_data, promoter_utr_model,
# back, gc_states, cat_states and the various spacer collections are defined
# in code not visible here.
tata_states = sequence_state_factory(tata_data, 'tata')
# Spacers after the TATA box emitting the non-coding distribution:
# a 16-state variable-length run and a fixed 4-state run.
post_tata_var_spacers = spacer_states_maker(16, no_coding.p, 'post_tata_var_spacer')
post_tata_spacers = spacer_states_maker(4, no_coding.p, 'post_tata_spacer')

# Initiator (Inr) motif states, plus an alternative "no inr" chain.
inr_states = sequence_state_factory(inr_data, 'inr')

no_inr_states = sequence_state_factory(no_inr_data, 'no inr')
# Add States
promoter_utr_model.add_state(back)


# Add Sequences

#GC

add_sequence(promoter_utr_model, gc_states)

# GC box followed by fixed + variable spacers toward the TATA box and the
# transcription start site (tss) respectively.
add_sequence(promoter_utr_model, post_gc_spacers_tata)
add_variable_length_sequence(promoter_utr_model, post_gc_var_spacers_tata, post_gc_spacers_tata[0])

add_sequence(promoter_utr_model, post_gc_spacers_tss)
add_variable_length_sequence(promoter_utr_model, post_gc_var_spacers_tss, post_gc_spacers_tss[0])

add_sequence(promoter_utr_model, inr_states)
add_sequence(promoter_utr_model, no_inr_states)

# CAAT
add_sequence(promoter_utr_model, cat_states)

add_sequence(promoter_utr_model, post_cat_spacers_tss)
add_variable_length_sequence(promoter_utr_model, post_cat_var_spacers_tss, post_cat_spacers_tss[0])
# Example fragment: near-duplicate of the coding-to-donor model defined in
# train_and_test() above.
# NOTE(review): the model name 'codiing to donor' looks like a typo for
# 'coding to donor'; left unchanged since the name may be used as a lookup
# key elsewhere.
# NOTE(review): c0/c1/c2 and matrixDonor0 come from code not shown here;
# the fragment is cut off before post -> end and bake().
coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

# Donor-site motif states from the (externally defined) donor matrix.
donor0_data = classify(matrixDonor0, 2)
donor0_states = sequence_state_factory(donor0_data, 'donor0')

# Uniform-emission tail state after the donor motif.
post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('codiing to donor')

model.add_state(coding_state0)
model.add_state(coding_state1)
model.add_state(coding_state2)

add_sequence(model, donor0_states)

model.add_state(post)

model.add_transition(model.start, coding_state0, 1)

# Each coding phase advances to the next phase (0.6) or exits into the
# donor-site chain (0.4).
model.add_transition(coding_state0, coding_state1, 0.6)
model.add_transition(coding_state0, donor0_states[0], 0.4)

model.add_transition(coding_state1, coding_state2, 0.6)
model.add_transition(coding_state1, donor0_states[0], 0.4)

model.add_transition(coding_state2, coding_state0, 0.6)
model.add_transition(coding_state2, donor0_states[0], 0.4)

model.add_transition(donor0_states[-1], post, 1)
# ===== Esempio n. 8 (Example 8) =====
# (scraper separator; stray "0" marker line preserved as a comment)
# Truncated example fragment: 5'-UTR model stitching a promoter sub-model to
# UTR exon/intron states.  donor0_data, acceptor0_data, no_coding_dist,
# promoter_model, get_state and the helper factories are defined in code not
# visible here; the fragment likely continues past the last line shown.
donor_states = sequence_state_factory(donor0_data, 'donor0')
acceptor_states = sequence_state_factory(acceptor0_data, 'acceptor0')
intron_spacer_states = spacer_states_maker(10, no_coding_dist, 'intron spacer')

utr_model = HiddenMarkovModel('utr_model')

# States
# UTR exon emissions estimated from 'mcutsa.txt'; intron state emits the
# generic non-coding distribution.
exon_state = State(DiscreteDistribution(calculator.utr_exon_5('mcutsa.txt').p),
                   name='utr exon')
intron_state = State(DiscreteDistribution(no_coding_dist), name='utr intron')

# Embed the entire promoter sub-model, then add the UTR-specific states.
utr_model.add_model(promoter_model)
utr_model.add_state(exon_state)
utr_model.add_state(intron_state)

add_sequence(utr_model, donor_states)
add_sequence(utr_model, acceptor_states)
add_sequence(utr_model, intron_spacer_states)

# Wire the promoter sub-model's entry ('back') and exits ('inr7' /
# 'no inr7') to the UTR exon state.
utr_model.add_transition(utr_model.start, get_state(promoter_model, 'back'), 1)
utr_model.add_transition(get_state(promoter_model, 'inr7'), exon_state, 1)
utr_model.add_transition(get_state(promoter_model, 'no inr7'), exon_state, 1)

# Exon self-loops (0.7), enters a donor site (0.2), or ends the model (0.1).
utr_model.add_transition(exon_state, exon_state, 0.7)
utr_model.add_transition(exon_state, donor_states[0], 0.2)
utr_model.add_transition(exon_state, utr_model.end, 0.1)

utr_model.add_transition(donor_states[-1], intron_state, 1)

utr_model.add_transition(intron_state, intron_state, 0.5)
utr_model.add_transition(intron_state, intron_spacer_states[0], 0.5)