# Builds the 'intron_acceptor' hidden Markov model: a self-looping intron
# state, the chain of acceptor0 states, and a trailing 'post' state.
import numpy
from pomegranate import State, DiscreteDistribution, HiddenMarkovModel

import calculator
from converter_to import converter_to
from model_maker_utils import (
    sequence_state_factory,
    classify,
    add_sequence,
    equal_distribution,
)
from matrix_from_aln import matrix_from_exa

# Acceptor examples are read from the .exa file and run through classify().
matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

# Emitting states: intron background, acceptor site chain, and 'post'.
intron = State(
    DiscreteDistribution(calculator.intron_calculator('cuts_intron.txt').p),
    name='in',
)
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
post = State(DiscreteDistribution(equal_distribution), name='post')

model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

# Topology: start -> intron (self-loop 0.9) -> acceptor chain -> post.
for src, dst, prob in (
    (model.start, intron, 1),
    (intron, intron, 0.9),
    (intron, acceptor0_states[0], 0.1),
    (acceptor0_states[-1], post, 1),
):
    model.add_transition(src, dst, prob)
# Builds the 'intron_acceptor' hidden Markov model from the files under
# '../data extractors/': a self-looping intron state followed by the chain
# of acceptor0 states; a 'post' state is registered as well.
import numpy
from pomegranate import State, DiscreteDistribution, HiddenMarkovModel

import calculator
from converter_to import converter_to
from model_maker_utils import (
    sequence_state_factory,
    classify,
    add_sequence,
    equal_distribution,
)
from matrix_from_aln import matrix_from_exa

# Acceptor examples are read from the .exa file and run through classify().
matrixAcceptor0 = numpy.array(
    matrix_from_exa('../data extractors/new_acceptor1.exa')
)
acceptor0_data = classify(matrixAcceptor0, 2)

model = HiddenMarkovModel('intron_acceptor')

# Emitting states: intron background, acceptor site chain, and 'post'.
intron = State(
    DiscreteDistribution(
        calculator.intron_calculator(
            '../data extractors/new_cuts_intron.txt'
        ).p
    ),
    name='in',
)
acceptor0_states = sequence_state_factory(acceptor0_data, 'acceptor0')
post = State(DiscreteDistribution(equal_distribution), name='post')

model.add_state(intron)
add_sequence(model, acceptor0_states)
model.add_state(post)

# Topology so far: start -> intron (self-loop 0.9) -> first acceptor state.
for src, dst, prob in (
    (model.start, intron, 1),
    (intron, intron, 0.9),
    (intron, acceptor0_states[0], 0.1),
):
    model.add_transition(src, dst, prob)
import itertools

import calculator
from model_maker_utils import sequence_state_factory
from model_maker_utils import classify
from model_maker_utils import add_sequence
from model_maker_utils import spacer_states_maker
from model_maker_utils import percentage_matrix_maker
from stop_example_divider import divider as stop_divider


def foo(l):
    """Yield every ordered 3-tuple whose items are drawn from ``l``.

    This is the Cartesian product ``l x l x l`` in the order produced by
    ``itertools.product``.

    ``repeat=3`` is used instead of passing ``l`` three times so that a
    one-shot iterator is materialised once and reused for all three
    positions; passing the same iterator three times would exhaust it on the
    first pool and yield nothing.

    Args:
        l: Any iterable of items.

    Yields:
        Tuples of length 3.
    """
    yield from itertools.product(l, repeat=3)


# Three distributions computed by calculator.calculate_proba2 from 'cuts.txt'.
c0, c1, c2 = calculator.calculate_proba2('cuts.txt')

# Example matrices loaded from the .exa files.
matrixZE = numpy.array(matrix_from_exa('new_tss.exa'))
# matrixEZ = numpy.array(matrix_from_exa('new_tts.exa'))
taa_matrix, tga_matrix, tag_matrix = stop_divider('new_tts.exa')

matrixDonor0 = numpy.array(matrix_from_exa('new_donor0.exa'))
matrixDonor1 = numpy.array(matrix_from_exa('new_donor1.exa'))
matrixDonor2 = numpy.array(matrix_from_exa('new_donor2.exa'))

matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor0.exa'))
matrixAcceptor1 = numpy.array(matrix_from_exa('new_acceptor1.exa'))
matrixAcceptor2 = numpy.array(matrix_from_exa('new_acceptor2.exa'))

# (hexamer, number) pairs; the numbers are presumably occurrence counts of
# each poly-A signal variant -- TODO confirm against the data source.
polyASeqs = [
    ('AATAAA', 592),
    ('ATTAAA', 149),
    ('AGTAAA', 27),
    ('TATAAA', 32),
    ('CATAAA', 13),
    ('GATAAA', 13),
    ('AATATA', 17),
    ('AATACA', 12),
    ('AATAGA', 7),
    ('ACTAAA', 6),
    ('AAGAAA', 11),
    ('AATGAA', 8),
]
# Builds the 'coding_to_stop' hidden Markov model: three coding states, the
# chain of stop states, and a 'post' state.
import numpy
from pomegranate import State, DiscreteDistribution, HiddenMarkovModel

import calculator
from model_maker_utils import (
    sequence_state_factory,
    classify,
    add_sequence,
    equal_distribution,
)
from matrix_from_aln import matrix_from_exa
from converter_to import converter_to

# Three distributions from calculate_proba2, one per coding state below.
c0, c1, c2 = calculator.calculate_proba2('../data extractors/new_cuts.txt')
matrixStop = numpy.array(
    matrix_from_exa('../data extractors/new_stops.exa')
)

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

# Stop examples are run through classify() and turned into a state chain.
stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

# Register states in the same order as before: coding 0..2, stop chain, post.
for coding_state in (coding_state0, coding_state1, coding_state2):
    model.add_state(coding_state)
add_sequence(model, stop_states)
model.add_state(post)

# The model always starts in 'coding state 1'.
model.add_transition(model.start, coding_state1, 1)
def train_and_test():
    """Build the 'coding to donor' HMM, fit it on exon examples, and save it.

    Training sequences are read from '../data extractors/exons_start_1.txt'
    (every 'P' removed, lowercased, newlines stripped) and encoded with
    ``converter_to(seq, 2)``. The model has three coding states that cycle
    0 -> 1 -> 2 -> 0 (probability 0.6), each of which may instead enter the
    donor0 state chain (probability 0.4); the chain ends in a self-looping
    'post' state that leads to the model end.

    ``test_model`` is called on the baked model before and after ``fit``.
    The fitted model is written as JSON to
    'partial_model_coding_to_donor_model0.json' in the working directory.
    """
    # Load and normalise the raw training lines.
    with open('../data extractors/exons_start_1.txt') as in_file:
        total = [
            raw.replace('P', '').lower().replace('\n', '')
            for raw in in_file
        ]
    converted_total = [converter_to(seq, 2) for seq in total]

    donor_matrix = numpy.array(
        matrix_from_exa('../data extractors/new_donor1.exa')
    )

    c0, c1, c2 = calculator.calculate_proba2(
        '../data extractors/new_cuts.txt'
    )
    print(c0.p, c1.p, c2.p)

    # One coding state per distribution returned by calculate_proba2.
    coding_labels = ('coding state 0', 'coding state 1', 'coding state 2')
    coding_states = [
        State(DiscreteDistribution(dist.p), label)
        for dist, label in zip((c0, c1, c2), coding_labels)
    ]

    donor_states = sequence_state_factory(classify(donor_matrix, 2), 'donor0')
    post = State(DiscreteDistribution(equal_distribution), name='post')

    model = HiddenMarkovModel('coding to donor')
    for coding_state in coding_states:
        model.add_state(coding_state)
    add_sequence(model, donor_states)
    model.add_state(post)

    model.add_transition(model.start, coding_states[0], 1)

    # Each coding state advances to the next one in the cycle or leaves for
    # the first donor state.
    donor_entry = donor_states[0]
    for position, current in enumerate(coding_states):
        following = coding_states[(position + 1) % len(coding_states)]
        model.add_transition(current, following, 0.6)
        model.add_transition(current, donor_entry, 0.4)

    model.add_transition(donor_states[-1], post, 1)
    model.add_transition(post, post, 0.9)
    model.add_transition(post, model.end, 0.1)

    model.bake()

    # Evaluate before and after training.
    test_model(model)
    model.fit(
        converted_total,
        transition_pseudocount=1,
        emission_pseudocount=1,
        verbose=True,
    )
    test_model(model)

    with open('partial_model_coding_to_donor_model0.json', 'w') as out:
        out.write(model.to_json())
# Builds the 'coding_to_stop' hidden Markov model: three coding states and
# the chain of stop states. The stop examples are also split into three
# matrices by stop_divider.
import numpy
from pomegranate import State, DiscreteDistribution, HiddenMarkovModel

import calculator
from stop_example_divider import divider as stop_divider
from model_maker_utils import (
    sequence_state_factory,
    classify,
    add_sequence,
    equal_distribution,
)
from matrix_from_aln import matrix_from_exa
from converter_to import converter_to

# Three distributions from calculate_proba2, one per coding state below.
c0, c1, c2 = calculator.calculate_proba2('cuts.txt')
matrixStop = numpy.array(matrix_from_exa('new_tts.exa'))

coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0')
coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1')
coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2')

# Same .exa file as matrixStop, divided into three matrices.
taa_matrix, tga_matrix, tag_matrix = stop_divider('new_tts.exa')

post = State(DiscreteDistribution(equal_distribution), name='post')

model = HiddenMarkovModel('coding_to_stop')

# Stop examples are run through classify() and turned into a state chain.
stop_data = classify(matrixStop, 2)
stop_states = sequence_state_factory(stop_data, 'stop')

# Register the coding states in order, then the stop chain.
for coding_state in (coding_state0, coding_state1, coding_state2):
    model.add_state(coding_state)
add_sequence(model, stop_states)
for x_line in in_file: test_line = x_line.lower().replace('\n', '').replace(' ', '') tonight = converter_to(test_line, 2) logp, path = model.viterbi(tonight) path = [x[1].name for i, x in enumerate(path) if i < len(tonight)] if path[48] == 'start zone7': oks += 1 else: not_ok += 1 print(oks / (oks + not_ok)) back = State(DiscreteDistribution(equal_distribution), name='back') back2 = State(DiscreteDistribution(equal_distribution), name='back2') matrixZE = numpy.array(matrix_from_exa('../data extractors/starts.exa')) start_states_data = classify(matrixZE, 2) start_states = sequence_state_factory(start_states_data, 'start zone') model = HiddenMarkovModel() model.add_state(back) model.add_state(back2) add_sequence(model, start_states) model.add_transition(model.start, back, 1) model.add_transition(back, back, 0.55) model.add_transition(back, start_states[0], 0.45) model.add_transition(start_states[-1], back2, 1) model.add_transition(back2, back2, 0.5)
if path_names[-50] == 'donor04': oks += 1 else: noks += 1 print(oks / (oks + noks)) with open('exons_start_1.txt') as in_file: total = [] for line in in_file: no_p_line = line.replace('P', '').lower().replace('\n', '') total.append(no_p_line) converted_total = [converter_to(x, 2) for x in total] matrixDonor0 = numpy.array(matrix_from_exa('new_donor1.exa')) c0, c1, c2 = calculator.calculate_proba2('cuts.txt') coding_state0 = State(DiscreteDistribution(c0.p), 'coding state 0') coding_state1 = State(DiscreteDistribution(c1.p), 'coding state 1') coding_state2 = State(DiscreteDistribution(c2.p), 'coding state 2') donor0_data = classify(matrixDonor0, 2) donor0_states = sequence_state_factory(donor0_data, 'donor0') post = State(DiscreteDistribution(equal_distribution), name='post') model = HiddenMarkovModel('codiing to donor') model.add_state(coding_state0)
for x_line in in_file: test_line = x_line.lower().replace('\n', '').replace(' ', '') tonight = converter_to(test_line, 2) logp, path = model.viterbi(tonight) path = [x[1].name for i, x in enumerate(path) if i < len(tonight)] if path[48] == 'start zone7': oks += 1 else: not_ok += 1 print(oks / (oks + not_ok)) back = State(DiscreteDistribution(equal_distribution), name='back') back2 = State(DiscreteDistribution(equal_distribution), name='back2') matrixZE = numpy.array(matrix_from_exa('new_tss.exa')) start_states_data = classify(matrixZE, 2) start_states = sequence_state_mulfactory(start_states_data, 'start zone') model = HiddenMarkovModel() model.add_state(back) model.add_state(back2) add_sequence(model, start_states) model.add_transition(model.start, back, 1) model.add_transition(back, back, 0.55) model.add_transition(back, start_states[0], 0.45) model.add_transition(start_states[-1], back2, 1) model.add_transition(back2, back2, 0.5)
from model_maker_utils import (
    add_variable_length_sequence,
    load_long_training_examples,
    get_state,
)
from converter_to import converter_to
import calculator
from pomegranate import State, HiddenMarkovModel, DiscreteDistribution
from matrix_from_aln import matrix_from_exa

# Load the previously saved promoter/UTR base model from its JSON dump.
with open('promoter_utr_model_base.json') as base_model_file:
    promoter_model_json = base_model_file.read()
promoter_model = HiddenMarkovModel.from_json(promoter_model_json)

# Donor and acceptor examples, each run through classify().
matrixDonor0 = numpy.array(matrix_from_exa('new_donor0.exa'))
matrixAcceptor0 = numpy.array(matrix_from_exa('new_acceptor0.exa'))
donor0_data = classify(matrixDonor0, 2)
acceptor0_data = classify(matrixAcceptor0, 2)

# Distribution used for the intron spacer states below.
no_coding_dist = calculator.intron_calculator('cuts_intron.txt').p

donor_states = sequence_state_factory(donor0_data, 'donor0')
acceptor_states = sequence_state_factory(acceptor0_data, 'acceptor0')
intron_spacer_states = spacer_states_maker(
    10, no_coding_dist, 'intron spacer'
)

utr_model = HiddenMarkovModel('utr_model')

# States
exon_state = State(
    DiscreteDistribution(calculator.utr_exon_5('mcutsa.txt').p),
    name='utr exon',
)