def test_compute_score_and_traceback_matrices(self):
    """Compare computed score/traceback matrices with hand-worked values.

    Three scenarios are exercised, all with an identity substitution
    matrix (match 2, mismatch -1) and the positional arguments 5 and 2:
    identical prefixes, differing sequences, and two-sequence alignments.
    """

    def check(aln1, aln2, want_score_m, want_tback_m):
        # A fresh substitution matrix is built for every scenario.
        subs = make_identity_substitution_matrix(2, -1)
        got_score_m, got_tback_m = _compute_score_and_traceback_matrices(
            aln1, aln2, 5, 2, subs)
        np.testing.assert_array_equal(got_score_m, want_score_m)
        np.testing.assert_array_equal(got_tback_m, want_tback_m)

    # 'ACG' against 'ACGT'; these results were computed manually
    check(
        TabularMSA([DNA('ACG', metadata={'id': 'id'})]),
        TabularMSA([DNA('ACGT', metadata={'id': 'id'})]),
        [[0, -5, -7, -9],
         [-5, 2, -3, -5],
         [-7, -3, 4, -1],
         [-9, -5, -1, 6],
         [-11, -7, -3, 1]],
        [[0, 3, 3, 3],
         [2, 1, 3, 3],
         [2, 2, 1, 3],
         [2, 2, 2, 1],
         [2, 2, 2, 2]])

    # The two remaining scenarios share one set of manually computed
    # expectations.
    differing_score_m = [[0, -5, -7, -9],
                         [-5, 2, -3, -5],
                         [-7, -3, 4, -1],
                         [-9, -5, -1, 3],
                         [-11, -7, -3, -2]]
    differing_tback_m = [[0, 3, 3, 3],
                         [2, 1, 3, 3],
                         [2, 2, 1, 3],
                         [2, 2, 2, 1],
                         [2, 2, 2, 1]]

    # different sequences
    check(
        TabularMSA([DNA('ACC', metadata={'id': 'id'})]),
        TabularMSA([DNA('ACGT', metadata={'id': 'id'})]),
        differing_score_m,
        differing_tback_m)

    # four sequences provided in two alignments
    check(
        TabularMSA([DNA('ACC', metadata={'id': 's1'}),
                    DNA('ACC', metadata={'id': 's2'})]),
        TabularMSA([DNA('ACGT', metadata={'id': 's3'}),
                    DNA('ACGT', metadata={'id': 's4'})]),
        differing_score_m,
        differing_tback_m)
def test_make_identity_substitution_matrix(self):
    """The default matrix covers 'ACGTU' with match/mismatch scores."""
    symbols = 'ACGTU'
    for match, mismatch in ((1, -2), (5, -4)):
        # Diagonal entries carry the match score, all others the
        # mismatch score.
        expected = {
            row: {col: (match if col == row else mismatch)
                  for col in symbols}
            for row in symbols
        }
        self.assertEqual(
            make_identity_substitution_matrix(match, mismatch), expected)
def test_nucleotide_aligners_use_substitution_matrices(self):
    """An explicit substitution matrix must change the alignment result.

    Both nucleotide aligners are run twice on the same pair of sequences:
    once with only match/mismatch scores and once with an alternate
    substitution matrix supplied as well. The results must differ.
    """
    alt_sub = make_identity_substitution_matrix(10, -10)
    seq1 = "GACCTTGACCAGGTACC"
    seq2 = "GAACTTTGACGTAAC"
    scoring = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                   match_score=5, mismatch_score=-4)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with local alignment
    actual_no_sub = local_pairwise_align_nucleotide(seq1, seq2, **scoring)
    actual_alt_sub = local_pairwise_align_nucleotide(
        seq1, seq2, substitution_matrix=alt_sub, **scoring)
    self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
    self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
    self.assertNotEqual(actual_no_sub.score(), actual_alt_sub.score())

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with global alignment
    #
    # The baseline here must come from the *global* aligner. It used to
    # be produced by the local aligner, so the comparison was local vs.
    # global and could not fail for the reason this test is about.
    actual_no_sub = global_pairwise_align_nucleotide(seq1, seq2, **scoring)
    actual_alt_sub = global_pairwise_align_nucleotide(
        seq1, seq2, substitution_matrix=alt_sub, **scoring)
    # Compare the aligned pair as a whole: a changed alignment is only
    # required to differ in at least one of its two sequences.
    self.assertNotEqual(
        (str(actual_no_sub[0]), str(actual_no_sub[1])),
        (str(actual_alt_sub[0]), str(actual_alt_sub[1])))
    self.assertNotEqual(actual_no_sub.score(), actual_alt_sub.score())
def test_compute_score_and_traceback_matrices_invalid(self):
    """A character missing from the substitution matrix is an error."""
    subs = make_identity_substitution_matrix(2, -1)
    # 'W' appears in the first sequence; an informative ValueError is
    # expected because the substitution matrix has no entry for it.
    with_unknown_char = Alignment([DNA('AWG')])
    plain = Alignment([DNA('ACGT')])
    with self.assertRaises(ValueError):
        _compute_score_and_traceback_matrices(
            with_unknown_char, plain, 5, 2, subs)
def getStartPosMapper(seq, subst=None):
    """Build a function that locates peptides within a reference sequence.

    The returned callable is suitable as the mapping function for a
    peptide column of a DataFrame, aligning each entry to `seq`.

    Parameters
    ----------
    seq : str
        Reference AA sequence; used as the aligner's query sequence.
    subst : dict of dicts, optional
        Scores for each pair of AAs in peptide and sequence. When omitted,
        an identity matrix over AALPHABET is used (match 1, mismatch -1).

    Returns
    -------
    findPos : function
        Takes one peptide sequence, aligns it, and returns the alignment's
        'query_begin' minus its 'target_begin' as an int."""
    if subst is None:
        subst = make_identity_substitution_matrix(1, -1, alphabet=AALPHABET)

    # One aligner is built per reference sequence and shared by every call
    # of the returned function.
    aligner = StripedSmithWaterman(query_sequence=seq,
                                   protein=True,
                                   substitution_matrix=subst)

    def findPos(pep):
        alignment = aligner(pep)
        offset = alignment['query_begin'] - alignment['target_begin']
        return int(offset)

    return findPos
def test_compute_substitution_score(self):
    """Check substitution scores against manually computed values."""
    # Each row: first column characters, second column characters, the
    # fourth positional argument (presumably the score applied when a gap
    # is involved - confirm against the helper), and the expected score.
    subs_m = make_identity_substitution_matrix(5, -4)
    manual_cases = (
        (['A'], ['A'], 0, 5.0),
        (['A', 'A'], ['A'], 0, 5.0),
        (['A', 'C'], ['A'], 0, 0.5),
        (['A', 'C'], ['A', 'C'], 0, 0.5),
        (['A', 'A'], ['A', '-'], 0, 2.5),
        (['A', 'A'], ['A', '-'], 1, 3),
    )
    for chars1, chars2, fourth_arg, expected in manual_cases:
        observed = _compute_substitution_score(
            chars1, chars2, subs_m, fourth_arg)
        self.assertEqual(observed, expected)

    # alt subs_m
    alt_subs_m = make_identity_substitution_matrix(1, -2)
    observed = _compute_substitution_score(
        ['A', 'A'], ['A', '-'], alt_subs_m, 0)
    self.assertEqual(observed, 0.5)
def test_global_pairwise_align_custom_alphabet_nondegenerate_chars(self):
    """Globally align two CustomSequence objects with a custom matrix.

    The substitution matrix is built from
    ``CustomSequence.nondegenerate_chars``.
    """
    subs = make_identity_substitution_matrix(
        1, -1, alphabet=CustomSequence.nondegenerate_chars)

    result = global_pairwise_align(
        CustomSequence("WXYZ"), CustomSequence("WXYYZZ"), 10.0, 5.0, subs)
    obs_msa, obs_score, obs_start_end = result

    # Expected values computed by running an equivalent alignment using the
    # DNA alphabet with the following mapping:
    #
    # W X Y Z
    # | | | |
    # A C G T
    #
    exp_msa = TabularMSA([CustomSequence('WXYZ^^'),
                          CustomSequence('WXYYZZ')])
    self.assertEqual(obs_msa, exp_msa)
    self.assertEqual(obs_score, 2.0)
    self.assertEqual(obs_start_end, [(0, 3), (0, 5)])
def test_local_pairwise_align_custom_alphabet(self):
    """Locally align two CustomSequence objects with a custom matrix.

    The substitution matrix is built from ``CustomSequence.definite_chars``.
    """
    subs = make_identity_substitution_matrix(
        5, -4, alphabet=CustomSequence.definite_chars)

    result = local_pairwise_align(
        CustomSequence("YWXXZZYWXXWYYZWXX"),
        CustomSequence("YWWXZZZYWXYZWWX"),
        5.0, 0.5, subs)
    obs_msa, obs_score, obs_start_end = result

    # Expected values computed by running an equivalent alignment using the
    # DNA alphabet with the following mapping:
    #
    # W X Y Z
    # | | | |
    # A C G T
    #
    exp_msa = TabularMSA([CustomSequence('WXXZZYWXXWYYZWXX'),
                          CustomSequence('WXZZZYWX^^^YZWWX')])
    self.assertEqual(obs_msa, exp_msa)
    self.assertEqual(obs_score, 41.0)
    self.assertEqual(obs_start_end, [(1, 16), (2, 14)])
def test_nucleotide_aligners_use_substitution_matrices(self):
    """Supplying a substitution matrix must change the alignment result.

    The local and the global nucleotide aligner are each run with and
    without an alternate substitution matrix on the same two sequences.
    """
    alt_sub = make_identity_substitution_matrix(10, -10)
    base_scoring = {
        'gap_open_penalty': 10.,
        'gap_extend_penalty': 5.,
        'match_score': 5,
        'mismatch_score': -4,
    }

    def run(aligner, **extra):
        # Fresh DNA objects and a fresh keyword dict for every call.
        kwargs = dict(base_scoring)
        kwargs.update(extra)
        return aligner(DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
                       **kwargs)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with local alignment
    plain_msa, plain_score, plain_start_end = run(
        local_pairwise_align_nucleotide)
    alt_msa, alt_score, alt_start_end = run(
        local_pairwise_align_nucleotide, substitution_matrix=alt_sub)
    self.assertNotEqual(plain_msa, alt_msa)
    self.assertNotEqual(plain_score, alt_score)
    self.assertNotEqual(plain_start_end, alt_start_end)

    # alternate substitution matrix yields different alignment (the
    # aligned sequences and the scores are different) with global alignment
    plain_msa, plain_score, plain_start_end = run(
        global_pairwise_align_nucleotide)
    alt_msa, alt_score, alt_start_end = run(
        global_pairwise_align_nucleotide, substitution_matrix=alt_sub)
    self.assertNotEqual(plain_msa, alt_msa)
    self.assertNotEqual(plain_score, alt_score)
    # Unlike the local case, the start/end positions are expected to be
    # identical for the two global alignments.
    self.assertEqual(plain_start_end, alt_start_end)
def test_make_identity_substitution_matrix(self):
    """Verify the default-alphabet identity matrix for two score pairs."""
    letters = ('A', 'C', 'G', 'T', 'U')
    for match_score, mismatch_score in [(1, -2), (5, -4)]:
        expected = {}
        for letter in letters:
            # Start every row at the mismatch score, then overwrite the
            # diagonal entry with the match score.
            row = dict.fromkeys(letters, mismatch_score)
            row[letter] = match_score
            expected[letter] = row
        observed = make_identity_substitution_matrix(
            match_score, mismatch_score)
        self.assertEqual(observed, expected)
import argparse
import itertools
import re
from copy import deepcopy

import pandas as pd
import skbio
from skbio.alignment import (local_pairwise_align_ssw,
                             make_identity_substitution_matrix)
from skbio.sequence import Protein

# Identity scoring over the protein alphabet: 1 for a match, 0 for a mismatch.
ident = make_identity_substitution_matrix(
    match_score=1, mismatch_score=0, alphabet=skbio.sequence.Protein.alphabet)


def assembleOverlappingPeptides(pepArr, overlap=11):
    """Greedily merge overlapping peptides into one sequence.

    This is a work in progress, but the idea was to be able to rebuild the
    sequence from the set of overlapping 15mers.

    Pairs of the remaining peptides are globally aligned; the first pair
    whose alignment score reaches ``overlap - 8`` is merged into a single
    entry, and the search restarts until only one entry is left.

    Parameters
    ----------
    pepArr : iterable of str
        Peptide sequences to assemble. The input is not modified.
    overlap : int
        Nominal overlap between neighbouring peptides; a pair is merged
        when its alignment score is at least ``overlap - 8``.

    Returns
    -------
    str
        The single sequence left after all merges.

    Raises
    ------
    ValueError
        If more than one entry remains and no pair reaches the merge
        threshold (previously this case looped forever).
    IndexError
        If `pepArr` is empty."""
    assembled = list(pepArr)
    while len(assembled) > 1:
        for pepi1, pepi2 in itertools.combinations(range(len(assembled)), 2):
            pep1, pep2 = assembled[pepi1], assembled[pepi2]
            # NOTE(review): `pairwise2` is not imported in the visible part
            # of this file - presumably Biopython's Bio.pairwise2; confirm
            # and add the import.
            res = pairwise2.align.globalxs(pep2, pep1, -4, 0)[0]
            # res[0] / res[1] are the two gapped sequences, res[2] the score.
            if res[2] >= overlap - 8:
                # pepi2 > pepi1 always holds, so removing pepi2 leaves the
                # position of pepi1 untouched.
                assembled.pop(pepi2)
                # Fill the gaps of one aligned sequence with the residues
                # of the other.
                assembled[pepi1] = ''.join(
                    aa1 if aa1 != '-' else aa2
                    for aa1, aa2 in zip(res[0], res[1]))
                # The list changed; restart the pair search from scratch.
                break
        else:
            # No pair could be merged in a full pass, so no later pass
            # would succeed either.
            raise ValueError(
                'Could not assemble peptides: %d fragments remain with no '
                'pair scoring at least %d' % (len(assembled), overlap - 8))
    return assembled[0]
string = str(t) + str(pc) pc += 1 hash_val = hashlib.sha256(string.encode()) digest = bytearray(hash_val.digest()) # k - 1 dummy ops tmp = [] for i in range(0, k - 1): tmp.append((digest[i] % 4) + 1) idx = digest[20] % k # tmp.insert(idx, t[0]) tmp.insert(idx, t) x.extend(tmp) return x matrix = make_identity_substitution_matrix(2, 1, '0123456789') calibrate(matrix) for i in x: # ops.add(i[0]) ops.add(i) print("Orig:") for op in x: print(ops_dic[op], end=', ') print("") def list_to_str(a): return ''.join(map(str, a))