def test_io(tmpdir, nmm_example):
    """Write one profile three times to a file, read it back and re-score it.

    The Viterbi path for ``b"AUGAUU"`` must give the same log-likelihood
    with the fixture's in-memory model and with every model loaded from disk.
    """
    expected = -7.069201008427531

    alphabet = nmm_example["alphabet"]
    hmm = nmm_example["hmm"]
    dp = nmm_example["dp"]

    # Baseline: score with the in-memory model before any I/O happens.
    task = DPTask.create(dp)
    task.setup(Sequence.create(b"AUGAUU", alphabet))
    result = dp.viterbi(task)
    assert_allclose(hmm.loglikelihood(result.sequence, result.path), expected)

    filepath = Path(tmpdir / "prof.nmm")
    with Output.create(bytes(filepath)) as writer:
        profile = Profile.create(alphabet)
        profile.append_model(Model.create(hmm, dp))
        # The same profile object is written three times in a row.
        for _ in range(3):
            writer.write(profile)

    with Input.create(bytes(filepath)) as reader:
        nmodels = 0
        for loaded in reader:
            loaded_alphabet = loaded.alphabet
            model = loaded.models[0]
            task = DPTask.create(model.dp)
            task.setup(Sequence.create(b"AUGAUU", loaded_alphabet))
            result = model.dp.viterbi(task)
            score = model.hmm.loglikelihood(result.sequence, result.path)
            assert_allclose(score, expected)
            nmodels += 1
        assert nmodels == 3
def test_codon_state():
    """Check the name and the emission log-probabilities of a CodonState."""
    base = BaseAlphabet.create(b"ACGU", b"X")

    codonp = CodonLprob.create(base)
    for symbols, prob in ((b"AUG", 0.8), (b"AUU", 0.1)):
        codonp.set_lprob(Codon.create(symbols, base), log(prob))

    state = CodonState.create(b"M1", codonp)
    assert state.name == b"M1"

    # ACU was never given a log-probability above; the test expects -inf.
    expectations = (
        (b"AUG", log(0.8)),
        (b"AUU", log(0.1)),
        (b"ACU", -inf),
    )
    for symbols, expected in expectations:
        assert_allclose(state.lprob(Sequence.create(symbols, base)), expected)
def test_sequence_table():
    """Exercise add / lprob / normalize on a SequenceTable.

    Also checks that normalizing a table with no entries, and passing a
    sequence built on a different alphabet, both raise ``RuntimeError``.
    """
    alphabet = Alphabet.create(b"ACGT", b"X")
    table = SequenceTable.create(alphabet)

    def seq(symbols):
        # Sequence over the table's own alphabet.
        return Sequence.create(symbols, alphabet)

    def foreign_seq():
        # Sequence over a freshly created, different alphabet.
        return Sequence.create(b"AT", Alphabet.create(b"AT", b"X"))

    # Nothing has been added yet, so normalization is expected to fail.
    with pytest.raises(RuntimeError):
        table.normalize()

    table.add(seq(b"AGTG"), log(0.2))
    table.add(seq(b"T"), log(1.2))

    assert_allclose(table.lprob(seq(b"AGTG")), log(0.2))
    assert_allclose(table.lprob(seq(b"T")), log(1.2))
    assert lprob_is_zero(table.lprob(seq(b"")))

    with pytest.raises(RuntimeError):
        table.lprob(foreign_seq())

    with pytest.raises(RuntimeError):
        table.add(foreign_seq(), log(0.2))

    # After normalization the two entries are expected to be divided by
    # their sum, 0.2 + 1.2 = 1.4.
    table.normalize()
    assert_allclose(table.lprob(seq(b"AGTG")), log(0.2 / 1.4))
    assert_allclose(table.lprob(seq(b"T")), log(1.2 / 1.4), rtol=1e-6)
def test_mute_state():
    """Check name, emission, sequence-length bounds and string forms of a MuteState."""
    alphabet = Alphabet.create(b"ACGU", b"X")
    mute = MuteState.create(b"S", alphabet)

    assert mute.name == b"S"

    # The empty sequence is expected with probability one; a non-empty one
    # with probability zero.
    empty = Sequence.create(b"", alphabet)
    assert_allclose(mute.lprob(empty), log(1.0))
    non_empty = Sequence.create(b"AC", alphabet)
    assert lprob_is_zero(mute.lprob(non_empty))

    assert mute.min_seq == 0
    assert mute.max_seq == 0

    assert str(mute) == "S"
    assert repr(mute) == "<MuteState:S>"
def test_table_state():
    """Check that a TableState reports the log-probabilities stored in its table."""
    alphabet = Alphabet.create(b"ACGU", b"X")

    entries = ((b"AUG", 0.8), (b"AUU", 0.4))

    seqt = SequenceTable.create(alphabet)
    for symbols, prob in entries:
        seqt.add(Sequence.create(symbols, alphabet), log(prob))

    state = TableState.create(b"M2", seqt)
    assert state.name == b"M2"

    # The state must return exactly what was stored in the table.
    for symbols, prob in entries:
        assert_allclose(state.lprob(Sequence.create(symbols, alphabet)), log(prob))

    assert str(state) == "M2"
    assert repr(state) == "<TableState:M2>"
def test_io(tmpdir, imm_example):
    """Write three profiles to a file and read them back in two ways.

    The file is read once through an explicitly opened and closed ``Input``
    and once through ``Input`` used as a context manager. In both cases each
    loaded model must score the Viterbi path for ``b"AC"`` as ``log(0.48)``.
    """
    expected = log(0.48)

    def check_model(model):
        # Decode b"AC" with the loaded model and compare the path score.
        task = DPTask.create(model.dp)
        task.setup(Sequence.create(b"AC", model.hmm.alphabet))
        result = model.dp.viterbi(task)
        loaded_hmm = model.hmm
        assert_allclose(
            loaded_hmm.loglikelihood(result.sequence, result.path), expected
        )

    alphabet = imm_example["alphabet"]
    hmm = imm_example["hmm"]
    dp = imm_example["dp"]

    # Baseline: score with the in-memory model before any I/O happens.
    dp_task = DPTask.create(dp)
    dp_task.setup(Sequence.create(b"AC", alphabet))
    r = dp.viterbi(dp_task)
    assert_allclose(hmm.loglikelihood(r.sequence, r.path), expected)

    filepath = Path(tmpdir / "tmp.imm")
    with Output.create(bytes(filepath)) as output:
        # Three distinct profile objects, each wrapping the same hmm/dp pair.
        for _ in range(3):
            prof = Profile.create(alphabet)
            prof.append_model(Model.create(hmm, dp))
            output.write(prof)

    # Explicit open/close path. The try/finally guarantees the handle is
    # released even when an assertion inside the loop fails.
    reader = Input.create(bytes(filepath))
    try:
        nmodels = 0
        for prof in reader:
            check_model(prof.models[0])
            nmodels += 1
    finally:
        reader.close()
    assert nmodels == 3

    # Context-manager path.
    with Input.create(bytes(filepath)) as reader:
        nmodels = 0
        for prof in reader:
            check_model(prof.models[0])
            nmodels += 1
        assert nmodels == 3
def test_fragment():
    """Check the first two steps produced by iterating a Fragment.

    Only the steps for the start state and the first match state are
    inspected; the remaining steps of the path are not consumed.
    """
    alphabet = BaseAlphabet.create(b"ACGT", b"X")
    seq = Sequence.create(b"ACAAAGATX", alphabet)

    S = MuteState.create(b"S", alphabet)
    E = MuteState.create(b"E", alphabet)
    M1 = NormalState.create(
        b"M1",
        alphabet,
        [log(0.8), log(0.2), log(0.01), log(0.01)],
    )
    M2 = NormalState.create(
        b"M2",
        alphabet,
        [log(0.4), log(0.6), log(0.1), log(0.6)],
    )

    steps = [
        Step.create(S, 0),
        Step.create(M1, 1),
        Step.create(M2, 1),
        Step.create(E, 0),
    ]
    fragment = Fragment(seq, Path.create(steps))

    # (state, consumed length, consumed symbols) for the first two steps.
    expected_prefix = (
        (S, 0, b""),
        (M1, 1, b"A"),
    )
    cursor = iter(fragment)
    for state, seq_len, symbols in expected_prefix:
        frag_step = next(cursor)
        assert bytes(frag_step.sequence) == symbols
        assert frag_step.step.seq_len == seq_len
        assert frag_step.step.state.name == state.name
def test_hmm():
    """Build a small frame-state HMM and check the Viterbi path score for AUGAUU."""
    abc = BaseAlphabet.create(b"ACGU", b"X")

    quarter = log(0.25)
    baset = BaseLprob.create(abc, (quarter, quarter, quarter, quarter))

    codonp = CodonLprob.create(abc)
    for symbols, prob in ((b"AUG", 0.8), (b"AUU", 0.1)):
        codonp.set_lprob(Codon.create(symbols, abc), log(prob))

    B = MuteState.create(b"B", abc)
    M1 = FrameState.create(b"M1", baset, CodonMarg.create(codonp), 0.02)
    M2 = FrameState.create(b"M2", baset, CodonMarg.create(codonp), 0.01)
    E = MuteState.create(b"E", abc)

    hmm = HMM.create(abc)
    # Only B is given an explicit start log-probability.
    hmm.add_state(B, log(0.5))
    for state in (M1, M2, E):
        hmm.add_state(state)

    transitions = (
        (B, M1, 0.8),
        (B, M2, 0.2),
        (M1, M2, 0.1),
        (M1, E, 0.4),
        (M2, E, 0.3),
    )
    for source, target, prob in transitions:
        hmm.set_transition(source, target, log(prob))

    dp = hmm.create_dp(E)
    task = DPTask.create(dp)
    task.setup(Sequence.create(b"AUGAUU", abc))
    result = dp.viterbi(task)

    loglik = hmm.loglikelihood(task.sequence, result.path)
    assert_allclose(loglik, -7.069201008427531)
def test_hmm_states():
    """Check that adding the same state twice raises and leaves two states."""
    alphabet = Alphabet.create(b"ACGU", b"X")
    hmm = HMM.create(alphabet)

    S = MuteState.create(b"S", alphabet)
    hmm.add_state(S)

    seqt = SequenceTable.create(alphabet)
    for symbols, prob in ((b"AGU", 0.8), (b"AGG", 0.2)):
        seqt.add(Sequence.create(symbols, alphabet), log(prob))
    M = TableState.create(b"M", seqt)
    hmm.add_state(M)

    # Both states are already registered; a second add must be rejected.
    for duplicate in (S, M):
        with pytest.raises(ValueError):
            hmm.add_state(duplicate)

    assert len(hmm.states()) == 2
def test_hmm_viterbi_2():
    """Check Viterbi path scores on a linear S -> M1 -> M2 -> E model.

    The scores are checked again for two sequences after an extra
    M1 -> E transition has been set and the DP has been rebuilt.
    """
    alphabet = Alphabet.create(b"AC", b"X")
    hmm = HMM.create(alphabet)

    S = MuteState.create(b"S", alphabet)
    hmm.add_state(S, log(1.0))
    E = MuteState.create(b"E", alphabet)
    hmm.add_state(E, lprob_zero())
    M1 = NormalState.create(b"M1", alphabet, [log(0.8), log(0.2)])
    hmm.add_state(M1, lprob_zero())
    M2 = NormalState.create(b"M2", alphabet, [log(0.4), log(0.6)])
    hmm.add_state(M2, lprob_zero())

    for source, target in ((S, M1), (M1, M2), (M2, E), (E, E)):
        hmm.set_transition(source, target, log(1.0))
    hmm.normalize()
    # The E -> E self-loop is set before normalize() and zeroed right after.
    hmm.set_transition(E, E, lprob_zero())

    dp = hmm.create_dp(E)
    dp_task = DPTask.create(dp)
    first_round = (
        (b"AC", 0.48),
        (b"AA", 0.32),
        (b"CA", 0.08),
        (b"CC", 0.12),
    )
    for symbols, prob in first_round:
        seq = Sequence.create(symbols, alphabet)
        dp_task.setup(seq)
        r = dp.viterbi(dp_task)
        assert_allclose(hmm.loglikelihood(seq, r.path), log(prob))

    hmm.set_transition(M1, E, log(1.0))

    # The first sequence of the second round is created before the DP is
    # rebuilt, as in the original statement order.
    seq = Sequence.create(b"AC", alphabet)
    dp = hmm.create_dp(E)
    dp_task = DPTask.create(dp)
    dp_task.setup(seq)
    r = dp.viterbi(dp_task)
    assert_allclose(hmm.loglikelihood(seq, r.path), log(0.48))

    seq = Sequence.create(b"AA", alphabet)
    dp_task.setup(seq)
    r = dp.viterbi(dp_task)
    assert_allclose(hmm.loglikelihood(seq, r.path), log(0.32))
def test_sequence():
    """Check length, byte/str/repr forms and symbol validation of Sequence."""
    alphabet = Alphabet.create(b"ACGT", b"X")
    symbols = b"ACAAAGATX"

    seq = Sequence.create(symbols, alphabet)
    assert len(seq) == 9
    assert bytes(seq) == symbols
    assert str(seq) == "ACAAAGATX"
    assert repr(seq) == "<Sequence:ACAAAGATX>"

    # b"X" was passed as the alphabet's second argument; a sequence
    # containing it must be accepted without raising.
    Sequence.create(b"ACGXXT", alphabet)

    # Symbols outside the alphabet, including non-ASCII bytes, are rejected.
    with pytest.raises(RuntimeError):
        Sequence.create(b"ACGWT", alphabet)

    with pytest.raises(RuntimeError):
        Sequence.create("ACGTç".encode(), alphabet)
def test_hmm_viterbi_1():
    """Check transitions and the Viterbi score for b"AC" on a linear model."""
    alphabet = Alphabet.create(b"ACGU", b"X")
    hmm = HMM.create(alphabet)

    S = MuteState.create(b"S", alphabet)
    hmm.add_state(S, log(1.0))

    E = MuteState.create(b"E", alphabet)
    hmm.add_state(E, lprob_zero())

    m1_emission = [log(0.8), log(0.2), lprob_zero(), lprob_zero()]
    M1 = NormalState.create(b"M1", alphabet, m1_emission)
    hmm.add_state(M1, lprob_zero())

    m2_emission = [
        log(0.4 / 1.6),
        log(0.6 / 1.6),
        lprob_zero(),
        log(0.6 / 1.6),
    ]
    M2 = NormalState.create(b"M2", alphabet, m2_emission)
    hmm.add_state(M2, lprob_zero())

    for source, target in ((S, M1), (M1, M2), (M2, E), (E, E)):
        hmm.set_transition(source, target, log(1.0))
    hmm.normalize()
    # The E -> E self-loop is set before normalize() and zeroed right after.
    hmm.set_transition(E, E, lprob_zero())

    # None of these transitions has a non-zero probability at this point.
    for source, target in ((E, E), (S, S), (S, E), (E, S)):
        assert_allclose(hmm.transition(source, target), lprob_zero())

    dp = hmm.create_dp(E)
    dp_task = DPTask.create(dp)
    seq = Sequence.create(b"AC", alphabet)
    dp_task.setup(seq)
    result = dp.viterbi(dp_task)
    assert_allclose(hmm.loglikelihood(seq, result.path), log(0.3))
def test_normal_state():
    """Check emissions, length bounds, alphabet mismatch and string forms of a NormalState."""
    alphabet = Alphabet.create(b"ACGT", b"X")
    emission = ((b"A", 0.1), (b"C", 0.2), (b"G", 0.3), (b"T", 0.3))

    state = NormalState.create(
        b"M0",
        alphabet,
        [log(prob) for _, prob in emission],
    )
    assert state.name == b"M0"

    for symbol, prob in emission:
        assert_allclose(state.lprob(Sequence.create(symbol, alphabet)), log(prob))

    assert state.min_seq == 1
    assert state.max_seq == 1

    # A second alphabet object built from the same symbols is still rejected.
    with pytest.raises(RuntimeError):
        state.lprob(Sequence.create(b"T", Alphabet.create(b"ACGT", b"X")))

    # A two-symbol sequence gets zero probability.
    assert lprob_is_zero(state.lprob(Sequence.create(b"AC", alphabet)))

    assert str(state) == "M0"
    assert repr(state) == "<NormalState:M0>"
def test_hmm_loglikelihood():
    """Check ``HMM.loglikelihood`` for every two-symbol sequence over ACGU.

    All sequences are scored along the same path S -> M1 -> M2 -> E. A final
    check confirms that a path through a state that was never added to the
    HMM raises ``ValueError``.
    """
    alphabet = Alphabet.create(b"ACGU", b"X")
    hmm = HMM.create(alphabet)

    S = MuteState.create(b"S", alphabet)
    hmm.add_state(S, log(1.0))

    E = MuteState.create(b"E", alphabet)
    hmm.add_state(E, lprob_zero())

    M1 = NormalState.create(
        b"M1",
        alphabet,
        [log(0.8), log(0.2), lprob_zero(), lprob_zero()],
    )
    hmm.add_state(M1, lprob_zero())

    M2 = NormalState.create(
        b"M2",
        alphabet,
        [log(0.4 / 1.6), log(0.6 / 1.6), lprob_zero(), log(0.6 / 1.6)],
    )
    hmm.add_state(M2, lprob_zero())

    hmm.set_transition(S, M1, log(1.0))
    hmm.set_transition(M1, M2, log(1.0))
    hmm.set_transition(M2, E, log(1.0))
    hmm.set_transition(E, E, log(1.0))
    hmm.normalize()

    def loglik(symbols, second=M2):
        # Score `symbols` along S -> M1 -> `second` -> E. A fresh Sequence
        # and Path are built for every call.
        seq = Sequence.create(symbols, alphabet)
        path = Path.create(
            [
                Step.create(S, 0),
                Step.create(M1, 1),
                Step.create(second, 1),
                Step.create(E, 0),
            ]
        )
        return hmm.loglikelihood(seq, path)

    # The expected value is the product of the M1 emission of the first
    # symbol and the M2 emission of the second; all transitions on the path
    # were set to log(1.0). M1 emits only A and C, M2 never emits G.
    expected = (
        (b"AC", log(0.3)),
        (b"AA", log(0.2)),
        (b"AG", lprob_zero()),
        (b"AU", log(0.3)),
        (b"CC", log(0.075)),
        (b"CA", log(0.05)),
        (b"CG", lprob_zero()),
        (b"CU", log(0.075)),
        (b"GC", lprob_zero()),
        (b"GA", lprob_zero()),
        (b"GG", lprob_zero()),
        (b"GU", lprob_zero()),
        (b"UC", lprob_zero()),
        (b"UA", lprob_zero()),
        (b"UG", lprob_zero()),
        (b"UU", lprob_zero()),
    )
    for symbols, value in expected:
        assert_allclose(loglik(symbols), value)

    # M3 reuses the name b"M2" but is a separate state object that was never
    # added to the HMM, so a path through it must be rejected.
    M3 = NormalState.create(
        b"M2",
        alphabet,
        [log(0.4), log(0.6), lprob_zero(), log(0.6)],
    )
    with pytest.raises(ValueError):
        loglik(b"UU", second=M3)
def test_frame_state():
    """Check FrameState emissions and decoding at error rates 0.0 and 0.1.

    The first state is built before ``codonp.normalize()`` is called, the
    second one after it.
    """
    base = BaseAlphabet.create(b"ACGU", b"X")

    def seq(symbols):
        # Sequence over the base alphabet used throughout this test.
        return Sequence.create(symbols, base)

    basep = BaseLprob.create(base, (log(0.25), log(0.25), log(0.25), log(0.25)))

    codonp = CodonLprob.create(base)
    codonp.set_lprob(Codon.create(b"AUG", base), log(0.8))
    codonp.set_lprob(Codon.create(b"AUU", base), log(0.1))

    # Error rate 0.0: only the two codons set above have non-zero probability.
    exact = FrameState.create(b"M1", basep, CodonMarg.create(codonp), 0.0)
    assert lprob_is_zero(exact.lprob(seq(b"AUA")))
    assert_allclose(exact.lprob(seq(b"AUG")), log(0.8))
    assert_allclose(exact.lprob(seq(b"AUU")), log(0.1))
    for symbols in (b"AU", b"A", b"AUUA", b"AUUAA"):
        assert lprob_is_zero(exact.lprob(seq(symbols)))

    codonp.normalize()

    # Error rate 0.1: sequences of one to five symbols get non-zero
    # probability, a six-symbol sequence does not.
    noisy = FrameState.create(b"M1", basep, CodonMarg.create(codonp), 0.1)
    emissions = (
        (b"AUA", -6.905597115665666, {}),
        (b"AUG", -0.5347732882047062, {"rtol": 1e-6}),
        (b"AUU", -2.5902373304999466, {"rtol": 1e-6}),
        (b"AU", -2.9158434238698336, {}),
        (b"A", -5.914503505971854, {}),
        (b"AUUA", -6.881032208841384, {}),
        (b"AUUAA", -12.08828960987379, {}),
    )
    for symbols, value, tolerance in emissions:
        assert_allclose(noisy.lprob(seq(symbols)), value, **tolerance)
    assert lprob_is_zero(noisy.lprob(seq(b"AUUAAA")))

    # decode() returns a (log-probability, codon) pair.
    decodings = (
        (b"AUA", -7.128586690537968, b"AUG"),
        (b"AUAG", -4.813151489562624, b"AUG"),
        (b"A", -6.032286541628237, b"AUG"),
        (b"UUU", -8.110186062956258, b"AUU"),
    )
    for symbols, value, codon_symbols in decodings:
        lprob, codon = noisy.decode(seq(symbols))
        assert_allclose(lprob, value)
        assert codon.symbols == codon_symbols
def create_sequence(self, sequence: bytes) -> Sequence:
    """Return a ``Sequence`` built from *sequence* over ``self.alphabet``."""
    alphabet = self.alphabet
    return Sequence.create(sequence, alphabet)
def test_hmm_viterbi_3():
    """Check Viterbi path scores on a model with match and mute (D1, D2) states.

    Scores are checked with E as the DP end state, then with M2 as the end
    state, and once more with M2 after E has been deleted from the HMM.
    """
    alphabet = Alphabet.create(b"AC", b"X")
    hmm = HMM.create(alphabet)

    S = MuteState.create(b"S", alphabet)
    hmm.add_state(S, log(1.0))
    E = MuteState.create(b"E", alphabet)
    hmm.add_state(E, lprob_zero())
    M1 = NormalState.create(b"M1", alphabet, [log(0.8), log(0.2)])
    hmm.add_state(M1, lprob_zero())
    D1 = MuteState.create(b"D1", alphabet)
    hmm.add_state(D1, lprob_zero())
    M2 = NormalState.create(b"M2", alphabet, [log(0.4), log(0.6)])
    hmm.add_state(M2, lprob_zero())
    D2 = MuteState.create(b"D2", alphabet)
    hmm.add_state(D2, lprob_zero())

    transitions = (
        (S, M1, 0.8),
        (S, D1, 0.2),
        (M1, M2, 0.8),
        (M1, D2, 0.2),
        (D1, D2, 0.2),
        (D1, M2, 0.8),
        (D2, E, 1.0),
        (M2, E, 1.0),
        (E, E, 1.0),
    )
    for source, target, prob in transitions:
        hmm.set_transition(source, target, log(prob))
    hmm.normalize()
    # The E -> E self-loop is set before normalize() and zeroed right after.
    hmm.set_transition(E, E, lprob_zero())

    dp = hmm.create_dp(E)
    dp_task = DPTask.create(dp)

    # First decode: also inspect the returned sequence and the path steps.
    seq = Sequence.create(b"AC", alphabet)
    dp_task.setup(seq)
    result = dp.viterbi(dp_task)
    score = hmm.loglikelihood(seq, result.path)
    assert bytes(result.sequence) == b"AC"
    steps = list(result.path)
    for index, seq_len in enumerate((0, 1, 1, 0)):
        assert steps[index].seq_len == seq_len
    assert_allclose(score, log(0.3072))

    # Same DP task, reused for further sequences.
    for symbols, prob in ((b"AA", 0.2048), (b"A", 0.128), (b"AC", 0.3072)):
        seq = Sequence.create(symbols, alphabet)
        dp_task.setup(seq)
        r = dp.viterbi(dp_task)
        assert_allclose(hmm.loglikelihood(seq, r.path), log(prob))

    # DP ending at M2 instead of E.
    dp = hmm.create_dp(M2)
    dp_task = DPTask.create(dp)
    seq = Sequence.create(b"AC", alphabet)
    dp_task.setup(seq)
    r = dp.viterbi(dp_task)
    assert_allclose(hmm.loglikelihood(seq, r.path), log(0.3072))

    # Same check once E has been removed from the model.
    hmm.del_state(E)
    dp = hmm.create_dp(M2)
    dp_task = DPTask.create(dp)
    seq = Sequence.create(b"AC", alphabet)
    dp_task.setup(seq)
    result = dp.viterbi(dp_task)
    assert_allclose(hmm.loglikelihood(seq, result.path), log(0.3072))