def test_hmmer_prof():
    """Read three consecutive models from the gzipped ``three-profs.hmm.gz``.

    The first model is checked in detail (header, length, alphabet, a few
    match/insert/transition/composition values and its string form); for the
    second and third models only the ``LENG`` metadata entry is checked.
    """
    # The packaged resource is only needed while its bytes are being read, so
    # a context manager closes it right away -- and also when something fails,
    # which a trailing ``buffer.close()`` would not guarantee.
    with pkg_resources.open_binary(hmmer_reader.data, "three-profs.hmm.gz") as buffer:
        content = gzip.decompress(buffer.read()).decode()

    hmmfile = open_hmmer(StringIO(content))

    hmm = hmmfile.read_model()
    assert hmm.header == "HMMER3/f [3.1b2 | February 2015]"
    assert dict(hmm.metadata)["LENG"] == "40"
    assert hmm.M == 40
    assert hmm.alphabet == "ACDEFGHIKLMNPQRSTVWY"
    assert abs(hmm.match(2)["V"] - (-2.72416)) < 1e-6
    assert abs(hmm.insert(2)["V"] - (-2.98518)) < 1e-6
    assert abs(hmm.trans(3)["DD"] - (-0.9551)) < 1e-6
    assert abs(hmm.compo["N"] - (-3.18565)) < 1e-6

    output = str(hmm)
    assert "SM hmmsearch -Z 45638612 -E 1000 --cpu 4 HMM pfamseq" in output

    # Subsequent calls advance to the next model in the same file.
    hmm = hmmfile.read_model()
    assert dict(hmm.metadata)["LENG"] == "235"

    hmm = hmmfile.read_model()
    assert dict(hmm.metadata)["LENG"] == "449"
def test_standard_profile_nonhomo_and_homologous(PF03373):
    """Search ``KKKPGKEDNNK`` with ``multiple_hits`` enabled, then disabled.

    Both searches are expected to yield two fragments: a non-homologous
    ``KKK`` followed by a homologous ``PGKEDNNK``. Only the expected
    log-likelihood differs between the two runs.
    """
    with open_hmmer(PF03373) as reader:
        hmmer = create_profile(reader.read_profile())

    query = Sequence(b"KKKPGKEDNNK", hmmer.alphabet)

    # (homologous flag, symbols) expected for each fragment, in order.
    expected = ((False, b"KKK"), (True, b"PGKEDNNK"))

    def check_fragments(result):
        fragments = result.fragments
        assert_equal(len(fragments), 2)
        for index, (homologous, symbols) in enumerate(expected):
            assert_equal(fragments[index].homologous, homologous)
            assert_equal(fragments[index].sequence.symbols, symbols)

    # First run: the profile is expected to start with multiple hits enabled.
    assert_equal(hmmer.multiple_hits, True)
    multi = hmmer.search(query)
    assert_allclose(multi.loglikelihood, 10.707618955640605)
    check_fragments(multi)

    # Second run: same query with multiple hits switched off.
    hmmer.multiple_hits = False
    assert_equal(hmmer.multiple_hits, False)
    single = hmmer.search(query)
    assert_allclose(single.loglikelihood, 10.96037578075283)
    check_fragments(single)
def press(hmm_filepath: Union[Path, str]):
    """Write every model of a HMMER3 file into a ``.dcp`` output file.

    For each model read from ``hmm_filepath`` a profile over a DNA alphabet
    is created; its alternative model and then its null model are appended to
    a ``DCPProfile`` tagged with the model's ``NAME`` and ``ACC`` metadata,
    and that profile is written to the output.

    Parameters
    ----------
    hmm_filepath
        Path to the HMMER3 file to read.

    Notes
    -----
    The output name is the input file name with its suffix replaced by
    ``.dcp``. Only the final path component is kept, so no directory part is
    passed to ``Output.create``. A model without an ``ACC`` (or ``NAME``)
    metadata entry raises ``KeyError``.
    """
    source = Path(hmm_filepath)
    dna = nmm.DNAAlphabet()
    model_count = num_models(source)
    epsilon = 0.01

    target = source.with_suffix(".dcp").name.encode()

    with Output.create(target) as output, open_hmmer(source) as parser:
        progress = tqdm(parser, total=model_count, desc="Pressing")
        for hmmer3 in progress:
            model = HMMERModel(hmmer3)
            fields = dict(hmmer3.metadata)
            meta = Metadata.create(fields["NAME"].encode(), fields["ACC"].encode())

            profile = create_profile(model, dna, 0, epsilon)
            dcp_profile = DCPProfile.create(dna, meta)

            # Alternative model first ...
            alt_hmm = profile.alt_model.hmm
            alt_dp = alt_hmm.create_dp(profile.alt_model.special_node.T)
            dcp_profile.append_model(imm.Model.create(alt_hmm, alt_dp))

            # ... then the null model, in that order.
            null_hmm = profile.null_model.hmm
            null_dp = null_hmm.create_dp(profile.null_model.state)
            dcp_profile.append_model(imm.Model.create(null_hmm, null_dp))

            output.write(dcp_profile)
def test_frame_profile_frame1(PF03373):
    """Load the PF03373 profile and prepare an RNA sequence for it.

    NOTE(review): the visible body makes no assertions and uses neither the
    profile nor the sequence -- the test may be unfinished; confirm.
    """
    with open_hmmer(PF03373) as reader:
        hmmer = create_profile(reader.read_profile())

    # Codons that spell PGKEDNNK under the standard genetic code; the spaces
    # only separate codons for readability and are stripped.
    most_likely_rna_seq = b"CCU GGU AAA GAA GAU AAU AAC AAA".replace(b" ", b"")
def test_hmmer_reader_invalid_file():
    """Expect ``ParsingError`` when reading a model from ``A0ALD9.fasta``."""
    # The reader pulls from the resource lazily, so it must stay open for the
    # whole check; the context manager closes it even when the expected
    # exception is not raised and ``pytest.raises`` itself fails.
    with pkg_resources.open_text(hmmer_reader.data, "A0ALD9.fasta") as buffer:
        hmmfile = open_hmmer(buffer)
        with pytest.raises(ParsingError):
            hmmfile.read_model()
def test_hmmer_reader_corrupted_file():
    """Expect ``UnicodeDecodeError`` when reading ``PF02545.hmm.br.corrupted``.

    The resource is opened in text mode and handed to the reader directly.
    """
    # Keep the resource open while the reader consumes it; the context manager
    # closes it even when ``pytest.raises`` fails because nothing was raised.
    with pkg_resources.open_text(
        hmmer_reader.data, "PF02545.hmm.br.corrupted"
    ) as buffer:
        hmmfile = open_hmmer(buffer)
        with pytest.raises(UnicodeDecodeError):
            hmmfile.read_model()
def test_standard_profile_unihit_homologous_3(PF03373):
    """Search ``PGKEPNNK`` and expect one homologous fragment covering it all."""
    with open_hmmer(PF03373) as reader:
        hmmer = create_profile(reader.read_profile())

    query = Sequence(b"PGKEPNNK", hmmer.alphabet)
    result = hmmer.search(query)
    assert_allclose(result.loglikelihood, 6.883636719423446)

    fragments = result.fragments
    assert_equal(len(fragments), 1)

    only_fragment = fragments[0]
    assert_equal(only_fragment.homologous, True)
    # The single fragment must span the entire query.
    assert_equal(only_fragment.sequence.symbols, query.symbols)
def test_standard_profile_unihit_homologous_2(PF03373):
    """Search ``PGKENNK`` and expect one homologous fragment covering it all.

    Also checks the fragment's string form, ``[PGKENNK]``.
    """
    with open_hmmer(PF03373) as reader:
        hmmer = create_profile(reader.read_profile())

    query = Sequence(b"PGKENNK", hmmer.alphabet)
    result = hmmer.search(query)
    assert_allclose(result.loglikelihood, 3.299501501364073)

    fragments = result.fragments
    assert_equal(len(fragments), 1)

    only_fragment = fragments[0]
    assert_equal(only_fragment.homologous, True)
    # The single fragment must span the entire query.
    assert_equal(only_fragment.sequence.symbols, query.symbols)
    assert_equal(str(only_fragment), "[PGKENNK]")
def test_hmmer_reader():
    """Read the single protein model stored in the gzipped ``PF02545.hmm.gz``.

    Checks the header, the ``LENG`` metadata entry, the model length, the
    amino-acid alphabet, a few match/insert/transition/composition values and
    a line of the model's string form.
    """
    # The packaged resource is only needed while its bytes are being read, so
    # a context manager closes it right away -- and also when something fails,
    # which a trailing ``buffer.close()`` would not guarantee.
    with pkg_resources.open_binary(hmmer_reader.data, "PF02545.hmm.gz") as buffer:
        content = gzip.decompress(buffer.read()).decode()

    hmmfile = open_hmmer(StringIO(content))

    hmm = hmmfile.read_model()
    assert hmm.header == "HMMER3/f [3.1b2 | February 2015]"
    assert dict(hmm.metadata)["LENG"] == "166"
    assert hmm.M == 166
    assert hmm.alphabet == "ACDEFGHIKLMNPQRSTVWY"
    assert abs(hmm.match(2)["V"] - (-2.0152)) < 1e-6
    assert abs(hmm.insert(2)["V"] - (-2.98518)) < 1e-6
    assert abs(hmm.trans(83)["DD"] - (-0.94424)) < 1e-6
    assert abs(hmm.compo["N"] - (-3.21795)) < 1e-6

    output = str(hmm)
    assert "SM hmmsearch -Z 45638612 -E 1000 --cpu 4 HMM pfamseq" in output
def test_hmmer_reader_nt():
    """Read the nucleotide model stored in ``2OG-FeII_Oxy_3-nt.hmm.gz``.

    Checks the header, the ``LENG`` metadata entry, the model length, the
    ``ACGT`` alphabet, a few match/insert/transition/composition values and
    the ``DATE`` line of the model's string form.
    """
    # The packaged resource is only needed while its bytes are being read, so
    # a context manager closes it right away -- and also when something fails,
    # which a trailing ``buffer.close()`` would not guarantee.
    with pkg_resources.open_binary(
        hmmer_reader.data, "2OG-FeII_Oxy_3-nt.hmm.gz"
    ) as buffer:
        content = gzip.decompress(buffer.read()).decode()

    hmmfile = open_hmmer(StringIO(content))

    hmm = hmmfile.read_model()
    assert hmm.header == "HMMER3/f [3.1b2 | February 2015]"
    assert dict(hmm.metadata)["LENG"] == "315"
    assert hmm.M == 315
    assert hmm.alphabet == "ACGT"
    assert abs(hmm.match(2)["A"] - (-2.35771)) < 1e-6
    assert abs(hmm.insert(2)["G"] - (-1.38629)) < 1e-6
    assert abs(hmm.trans(83)["DD"] - (-0.40547)) < 1e-6
    assert abs(hmm.compo["T"] - (-1.50794)) < 1e-6

    output = str(hmm)
    assert "DATE Sun May 24 19:35:19 2015" in output
def test_standard_profile_unihit_homologous_1(PF03373):
    """Search ``PGKEDNNK`` with the default hit mode, then with it disabled.

    Both searches are expected to return one homologous fragment covering the
    whole query; only the expected log-likelihood differs.
    """
    with open_hmmer(PF03373) as reader:
        hmmer = create_profile(reader.read_profile())

    most_likely_seq = Sequence(b"PGKEDNNK", hmmer.alphabet)

    def check_single_homologous_fragment(result):
        fragments = result.fragments
        assert_equal(len(fragments), 1)
        only_fragment = fragments[0]
        assert_equal(only_fragment.homologous, True)
        assert_equal(only_fragment.sequence.symbols, most_likely_seq.symbols)

    # Run with whatever hit mode the freshly created profile has.
    first = hmmer.search(most_likely_seq)
    assert_allclose(first.loglikelihood, 11.867796719423442)
    check_single_homologous_fragment(first)

    # Run again with multiple hits switched off.
    hmmer.multiple_hits = False
    second = hmmer.search(most_likely_seq)
    assert_allclose(second.loglikelihood, 11.94063404337571)
    check_single_homologous_fragment(second)
def test_standard_profile_multihit_homologous1(PF03373):
    """Search a query holding two copies of ``PGKEDNNK`` in both hit modes.

    With the profile's initial hit mode, five fragments are expected,
    alternating non-homologous and homologous, and the items of the first two
    fragments are checked one by one. After ``multiple_hits`` is set to
    ``False`` three fragments are expected.
    """
    with open_hmmer(PF03373) as reader:
        hmmer = create_profile(reader.read_profile())

    query = Sequence(b"PPPPGKEDNNKDDDPGKEDNNKEEEE", hmmer.alphabet)

    def check_items(fragment, expected_items):
        # Each item is indexed explicitly (not zipped) so a fragment with too
        # few items fails instead of being silently accepted.
        items = list(fragment.items())
        for index, (symbols, step) in enumerate(expected_items):
            assert_equal(items[index][0], symbols)
            assert_equal(str(items[index][1]), step)

    # --- initial hit mode ---------------------------------------------------
    result = hmmer.search(query)
    assert_allclose(result.loglikelihood, 20.329227532144742)
    fragments = result.fragments
    assert_equal(len(fragments), 5)

    # (homologous flag, symbols) expected for each fragment, in order.
    expected_fragments = (
        (False, b"PPP"),
        (True, b"PGKEDNNK"),
        (False, b"DDD"),
        (True, b"PGKEDNNK"),
        (False, b"EEEE"),
    )
    for index, (homologous, symbols) in enumerate(expected_fragments):
        assert_equal(fragments[index].homologous, homologous)
        assert_equal(fragments[index].sequence.symbols, symbols)

    # (symbols, string form of the second element) for the leading fragment.
    check_items(
        fragments[0],
        (
            (b"", "<S,0>"),
            (b"P", "<N,1>"),
            (b"P", "<N,1>"),
            (b"P", "<N,1>"),
            (b"", "<B,0>"),
        ),
    )

    # Same for the first homologous fragment.
    check_items(
        fragments[1],
        (
            (b"P", "<M1,1>"),
            (b"G", "<M2,1>"),
            (b"K", "<M3,1>"),
            (b"E", "<M4,1>"),
            (b"D", "<M5,1>"),
            (b"N", "<M6,1>"),
            (b"N", "<M7,1>"),
            (b"K", "<M8,1>"),
        ),
    )

    # --- multiple hits switched off -----------------------------------------
    hmmer.multiple_hits = False
    result = hmmer.search(query)
    assert_allclose(result.loglikelihood, 8.666478660222928)
    fragments = result.fragments
    assert_equal(len(fragments), 3)

    assert_equal(fragments[0].homologous, False)
    middle = fragments[1]
    assert_equal(middle.homologous, True)
    assert_equal(middle.sequence.symbols, b"PGKEDNNK")
    assert_equal(fragments[2].homologous, False)