def test_alignment_str():
    """
    Check that an alignment created from gapped strings is rendered
    back to exactly those strings by ``str()``.
    """
    expected_lines = [
        "A-CCTGA----",
        "----T-ATGCT",
    ]
    sequences = [
        seq.NucleotideSequence("ACCTGA"),
        seq.NucleotideSequence("TATGCT"),
    ]
    trace = align.Alignment.trace_from_strings(expected_lines)
    alignment = align.Alignment(sequences, trace, None)

    # The string form has one line per aligned sequence
    rendered_lines = str(alignment).split("\n")
    assert rendered_lines == expected_lines
def plot_pb_scheme_alignment():
    """
    Generate a color scheme for the 16-letter alphabet ``a``-``p`` and
    use it to plot the example alignment stored in
    ``PB_EXAMPLE_FILE_NAME``.

    A substitution matrix is written to a temporary file, ``gecli`` is
    run on it to produce a color scheme file, and the resulting colors
    are passed to ``graphics.plot_alignment_type_based()``.

    Returns
    -------
    fig
        The figure created via ``plt.figure()`` that contains the
        alignment plot.
    """
    # Fixed seed; presumably makes the scheme generation reproducible
    # -- TODO confirm that gecli draws from the global `random` state
    random.seed(1)

    pb_letters = "abcdefghijklmnop"
    scheme_file = biotite.temp_file("json")
    mat_file = biotite.temp_file("mat")
    with open(mat_file, "w") as file:
        # PB substitution matrix, adapted from PBxplore
        # Layout: a header line with the 16 symbols, followed by one
        # line per symbol holding its label and 16 scores.
        # NOTE(review): the reader is assumed to tolerate the blank
        # first line and the leading indentation -- confirm.
        file.write(
            """
            a b c d e f g h i j k l m n o p
            a 516 -59 113 -105 -411 -177 -27 -361 47 -103 -644 -259 -599 -372 -124 -83
            b -59 541 -146 -210 -155 -310 -97 90 182 -128 -30 29 -745 -242 -165 22
            c 113 -146 360 -14 -333 -240 49 -438 -269 -282 -688 -682 -608 -455 -147 6
            d -105 -210 -14 221 5 -131 -349 -278 -253 -173 -585 -670 -1573 -1048 -691 -497
            e -411 -155 -333 5 520 185 186 138 -378 -70 -112 -514 -1136 -469 -617 -632
            f -177 -310 -240 -131 185 459 -99 -45 -445 83 -214 -88 -547 -629 -406 -552
            g -27 -97 49 -349 186 -99 665 -99 -89 -118 -409 -138 -124 172 128 254
            h -361 90 -438 -278 138 -45 -99 632 -205 316 192 -108 -712 -359 95 -399
            i 47 182 -269 -253 -378 -445 -89 -205 696 186 8 15 -709 -269 -169 226
            j -103 -128 -282 -173 -70 83 -118 316 186 768 196 5 -398 -340 -117 -104
            k -644 -30 -688 -585 -112 -214 -409 192 8 196 568 -65 -270 -231 -471 -382
            l -259 29 -682 -670 -514 -88 -138 -108 15 5 -65 533 -131 8 -11 -316
            m -599 -745 -608 -1573 -1136 -547 -124 -712 -709 -398 -270 -131 241 -4 -190 -155
            n -372 -242 -455 -1048 -469 -629 172 -359 -269 -340 -231 8 -4 703 88 146
            o -124 -165 -147 -691 -617 -406 128 95 -169 -117 -471 -11 -190 88 716 58
            p -83 22 6 -497 -632 -552 254 -399 226 -104 -382 -316 -155 146 58 609
            """
        )

    # Run the scheme generator; it writes its result to `scheme_file`
    gecli.main(args=[
        "--alphabet", pb_letters,
        "--matrix", mat_file,
        "--contrast", "300",
        "--lmin", "65",
        "--lmax", "70",
        "-f", scheme_file
    ])
    colors = graphics.load_color_scheme(scheme_file)["colors"]

    fig = plt.figure(figsize=(8.0, 5.0))
    ax = fig.gca()

    pb_alphabet = seq.LetterAlphabet(pb_letters)
    fasta_file = fasta.FastaFile()
    fasta_file.read(PB_EXAMPLE_FILE_NAME)
    # The FASTA entries hold the gapped (already aligned) strings
    seq_strings = list(fasta_file.values())
    sequences = [
        seq.GeneralSequence(pb_alphabet, seq_str.replace("-", ""))
        for seq_str in seq_strings
    ]
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    graphics.plot_alignment_type_based(
        ax, alignment,
        symbols_per_line=60, spacing=2, color_scheme=colors
    )
    fig.tight_layout()
    return fig
def test_identity():
    """
    Check the sequence identity of a fixed pairwise protein alignment
    for the modes ``all``, ``not_terminal`` and ``shortest``.
    """
    gapped = [
        "--HAKLPRDD--WL--",
        "FRHA--QRTDADWLHH",
    ]
    ungapped = [seq.ProteinSequence(s.replace("-", "")) for s in gapped]
    trace = align.Alignment.trace_from_strings(gapped)
    alignment = align.Alignment(ungapped, trace, score=None)

    # Reference values: 6 identical positions, divided by a
    # mode-dependent length (16, 12 and 10, respectively)
    expected = {
        "all": 6 / 16,
        "not_terminal": 6 / 12,
        "shortest": 6 / 10,
    }
    for mode, value in expected.items():
        assert align.get_sequence_identity(alignment, mode=mode) == value
def test_from_alignment():
    """
    Create a sequence profile from a two-sequence nucleotide alignment
    and compare its symbol counts, gap counts and alphabet with
    hand-written reference values.
    """
    gapped = [
        "CGTCAT--",
        "--TCATGC",
    ]
    sequences = [
        seq.NucleotideSequence("CGTCAT"),
        seq.NucleotideSequence("TCATGC"),
    ]
    trace = align.Alignment.trace_from_strings(gapped)
    alignment = align.Alignment(sequences, trace, None)
    profile = seq.SequenceProfile.from_alignment(alignment)

    # One row per alignment column, one entry per symbol (A, C, G, T)
    expected_symbols = np.array([
        [0, 1, 0, 0],
        [0, 0, 1, 0],
        [0, 0, 0, 2],
        [0, 2, 0, 0],
        [2, 0, 0, 0],
        [0, 0, 0, 2],
        [0, 0, 1, 0],
        [0, 1, 0, 0],
    ])
    # One gap in each of the two leading and two trailing columns
    expected_gaps = np.array([1, 1, 0, 0, 0, 0, 1, 1])
    expected_alphabet = seq.Alphabet(["A", "C", "G", "T"])

    assert np.array_equal(expected_symbols, profile.symbols)
    assert np.array_equal(expected_gaps, profile.gaps)
    assert expected_alphabet == profile.alphabet
def test_conversion_to_symbols():
    """
    Test that the symbols obtained from an alignment can be joined
    back into the gapped strings the alignment was created from.
    """
    seq_strings = [
        "HAKLPRDD--WKL--",
        "HA--PRDDADWKLHH",
        "HA----DDADWKLHH",
    ]
    sequences = []
    for gapped in seq_strings:
        sequences.append(seq.ProteinSequence(gapped.replace("-", "")))
    trace = align.Alignment.trace_from_strings(seq_strings)
    alignment = align.Alignment(sequences, trace, score=None)

    # Test the conversion back to strings of symbols:
    # a `None` entry is rendered as the gap character
    restored = []
    for sym_list in align.get_symbols(alignment):
        restored.append(
            "".join("-" if sym is None else sym for sym in sym_list)
        )
    assert restored == seq_strings
def test_simple_alignments(seq_type, seq1, seq2, seed, threshold,
                           ref_range1, ref_range2,
                           direction, score_only, uint8_code):
    """
    Check if `align_local_ungapped()` produces correct alignments
    based on simple known examples.

    The reference alignment is built from the given index ranges and
    scored with the standard substitution matrix for `seq_type`.
    Depending on `score_only`, either the score or the whole alignment
    is compared with the result.
    """
    # If the alignment is extended to one side only, the reference
    # range is cut at the seed position on the other side
    if direction == "upstream":
        ref_range1, ref_range2 = (
            (ref_range1[0], seed[0] + 1),
            (ref_range2[0], seed[1] + 1),
        )
    elif direction == "downstream":
        ref_range1, ref_range2 = (
            (seed[0], ref_range1[1]),
            (seed[1], ref_range2[1]),
        )

    seq1 = seq_type(seq1)
    seq2 = seq_type(seq2)
    matrix = (
        align.SubstitutionMatrix.std_nucleotide_matrix()
        if seq_type == seq.NucleotideSequence
        else align.SubstitutionMatrix.std_protein_matrix()
    )

    if not uint8_code:
        seq1, seq2, matrix = _convert_to_uint16_code(seq1, seq2, matrix)

    # Ungapped reference trace: both index ranges side by side
    ref_trace = np.stack(
        [np.arange(*ref_range1), np.arange(*ref_range2)],
        axis=-1
    )
    ref_alignment = align.Alignment([seq1, seq2], ref_trace)
    ref_score = align.score(ref_alignment, matrix)
    ref_alignment.score = ref_score

    test_result = align.align_local_ungapped(
        seq1, seq2, matrix, seed, threshold, direction, score_only
    )

    expected = ref_score if score_only else ref_alignment
    assert test_result == expected
seq.NucleotideSequence("tttacggctagctcagtcctaggtactatgctagc"), seq.NucleotideSequence("tttacggctagctcagtcctaggtatagtgctagc"), seq.NucleotideSequence("tttacggctagctcagccctaggtattatgctagc"), seq.NucleotideSequence("ctgacagctagctcagtcctaggtataatgctagc"), seq.NucleotideSequence("tttacagctagctcagtcctagggactgtgctagc"), seq.NucleotideSequence("tttacggctagctcagtcctaggtacaatgctagc"), seq.NucleotideSequence("ttgacggctagctcagtcctaggtatagtgctagc"), seq.NucleotideSequence("ctgatagctagctcagtcctagggattatgctagc"), seq.NucleotideSequence("ctgatggctagctcagtcctagggattatgctagc"), seq.NucleotideSequence("tttatggctagctcagtcctaggtacaatgctagc"), seq.NucleotideSequence("tttatagctagctcagcccttggtacaatgctagc"), seq.NucleotideSequence("ttgacagctagctcagtcctagggactatgctagc"), seq.NucleotideSequence("ttgacagctagctcagtcctagggattgtgctagc"), seq.NucleotideSequence("ttgacggctagctcagtcctaggtattgtgctagc")] # Sequences do not need to be aligned # -> Create alignment with trivial trace # [[0 0 0 ...] # [1 1 1 ...] # [2 2 2 ...] # ... ] alignment = align.Alignment( sequences = seqs, trace = np.tile(np.arange(len(seqs[0])), len(seqs)) \ .reshape(len(seqs), len(seqs[0])) \ .transpose(), score = 0 ) # Create sequence logo from alignment logo = graphics.SequenceLogo(alignment, 800, 100) fig = logo.generate() plt.show()
def test_new_position_matrices():
    """
    Compare the probability matrix, log-odds matrix, sequence
    probability and sequence score of a profile, built from six
    ungapped nucleotide sequences, with precomputed reference values.
    """
    seqs = [
        seq.NucleotideSequence(seq_str)
        for seq_str in (
            "AAGAAT", "ATCATA", "AAGTAA", "AACAAA", "ATTAAA", "AAGAAT"
        )
    ]
    # Trivial trace: row i holds index i for every sequence
    n_seqs, n_cols = len(seqs), len(seqs[0])
    trivial_trace = (
        np.tile(np.arange(n_cols), n_seqs)
        .reshape(n_seqs, n_cols)
        .transpose()
    )
    alignment = align.Alignment(
        sequences=seqs, trace=trivial_trace, score=0
    )
    profile = seq.SequenceProfile.from_alignment(alignment)
    query = seq.NucleotideSequence("AAAAAA")

    # --- Without pseudocount ---
    ref_ppm = np.array([
        [1.,         0.,         0.,  0.        ],
        [0.66666667, 0.,         0.,  0.33333333],
        [0.,         0.33333333, 0.5, 0.16666667],
        [0.83333333, 0.,         0.,  0.16666667],
        [0.83333333, 0.,         0.,  0.16666667],
        [0.66666667, 0.,         0.,  0.33333333],
    ])
    assert np.allclose(ref_ppm, profile.probability_matrix(), atol=1e-3)
    # The third reference row has probability 0 for the first symbol,
    # hence the expected probability of the query is exactly zero
    assert profile.sequence_probability(query) == 0.0

    # --- With pseudocount ---
    ref_ppm_pseudo = np.array([
        [0.89285714, 0.03571429, 0.03571429, 0.03571429],
        [0.60714286, 0.03571429, 0.03571429, 0.32142857],
        [0.03571429, 0.32142857, 0.46428571, 0.17857143],
        [0.75,       0.03571429, 0.03571429, 0.17857143],
        [0.75,       0.03571429, 0.03571429, 0.17857143],
        [0.60714286, 0.03571429, 0.03571429, 0.32142857],
    ])
    assert np.allclose(
        ref_ppm_pseudo, profile.probability_matrix(pseudocount=1),
        atol=1e-3
    )
    probability = profile.sequence_probability(query, pseudocount=1)
    assert probability == pytest.approx(0.0066, abs=1e-3)

    ref_pwm = np.array([
        [ 1.83650127, -2.80735492, -2.80735492, -2.80735492],
        [ 1.28010792, -2.80735492, -2.80735492,  0.36257008],
        [-2.80735492,  0.36257008,  0.8930848,  -0.48542683],
        [ 1.5849625,  -2.80735492, -2.80735492, -0.48542683],
        [ 1.5849625,  -2.80735492, -2.80735492, -0.48542683],
        [ 1.28010792, -2.80735492, -2.80735492,  0.36257008],
    ])
    assert np.allclose(
        ref_pwm, profile.log_odds_matrix(pseudocount=1), atol=1e-3
    )
    score = profile.sequence_score(query, pseudocount=1)
    assert score == pytest.approx(4.7593, abs=1e-3)