def test_global_pairwise_align_dtype_mismatch(self):
    """Mixing DNA and RNA operands must raise a dtype-mismatch ``TypeError``.

    Two combinations are exercised against an RNA ``TabularMSA``: a bare
    ``DNA`` sequence and a ``DNA`` sequence wrapped in a ``TabularMSA``.
    """
    expected_message = r"same dtype: 'DNA' != 'RNA'"

    # Operands are built inside the context manager, as in the original
    # test, so any error they raise is subject to the same check.
    with self.assertRaisesRegex(TypeError, expected_message):
        dna_operand = DNA('ACGT')
        rna_operand = TabularMSA([RNA('ACGU')])
        global_pairwise_align(dna_operand, rna_operand, 1.0, 1.0, {})

    with self.assertRaisesRegex(TypeError, expected_message):
        dna_operand = TabularMSA([DNA('ACGT')])
        rna_operand = TabularMSA([RNA('ACGU')])
        global_pairwise_align(dna_operand, rna_operand, 1.0, 1.0, {})
def test_global_pairwise_align_dtype_mismatch(self):
    """A DNA operand aligned against an RNA alignment raises ``TypeError``.

    The first operand is tried both as a bare ``DNA`` sequence and as a
    single-sequence ``TabularMSA``; the second operand is always an RNA
    ``TabularMSA``.
    """
    # Factories defer construction until we are inside the context manager,
    # matching the point at which the original test built its operands.
    first_operand_factories = (
        lambda: DNA('ACGT'),
        lambda: TabularMSA([DNA('ACGT')]),
    )
    for make_first_operand in first_operand_factories:
        with self.assertRaisesRegex(TypeError,
                                    "same dtype: 'DNA' != 'RNA'"):
            global_pairwise_align(
                make_first_operand(),
                TabularMSA([RNA('ACGU')]),
                1.0,
                1.0,
                {},
            )
def proteinAlign(seq1, seq2, gap_open_penalty, gap_extend_penalty, local=False):
    """Align two protein strings and summarise the alignment as a dict.

    Both inputs are upper-cased, wrapped in ``Protein`` and aligned with the
    ``blosum50`` substitution matrix.

    Parameters
    ----------
    seq1, seq2
        Sequences to align; each must provide ``.upper()``.
    gap_open_penalty, gap_extend_penalty
        Passed through unchanged to the aligner.
    local
        When true, ``local_pairwise_align`` is used. Otherwise
        ``global_pairwise_align`` is used with
        ``penalize_terminal_gaps=True``.

    Returns
    -------
    dict
        ``'aln1'`` / ``'aln2'``: the two aligned rows as strings;
        ``'score'``: the score returned by the aligner;
        ``'similarity'``: relative match frequency between the two rows,
        times 100, rounded to two decimals via string formatting.
    """
    upper_first = seq1.upper()
    upper_second = seq2.upper()

    if local:
        alignment, score, _positions = local_pairwise_align(
            Protein(upper_first),
            Protein(upper_second),
            gap_open_penalty,
            gap_extend_penalty,
            blosum50,
        )
    else:
        alignment, score, _positions = global_pairwise_align(
            Protein(upper_first),
            Protein(upper_second),
            gap_open_penalty,
            gap_extend_penalty,
            blosum50,
            penalize_terminal_gaps=True,
        )

    summary = {}
    summary['aln1'] = str(alignment[0])
    summary['aln2'] = str(alignment[1])
    summary['score'] = score
    match_percent = alignment[0].match_frequency(alignment[1], relative=True) * 100
    # Round-trip through a two-decimal string to get the rounded float.
    summary['similarity'] = float(format(match_percent, '.2f'))
    return summary
def pairwise_align_and_merge_sequences(self, input1, input2):
    """Align two sign strings globally and merge the aligned rows.

    Behaviour by input:

    * both non-empty: spaces are replaced by ``'*'``, the strings are wrapped
      in ``self.SignSequence``, globally aligned, and the two aligned rows
      are handed to ``self.merge_sequences`` (whose result is returned);
    * exactly one non-empty: every character of that input becomes its own
      single-element list;
    * both empty: ``[[""]]``.
    """
    first_present = len(input1) > 0
    second_present = len(input2) > 0

    if not first_present and not second_present:
        return [[""]]
    if first_present and not second_present:
        return [[symbol] for symbol in input1]
    if second_present and not first_present:
        return [[symbol] for symbol in input2]

    # Identity matrix arguments: match_score=4, mismatch_score=-2.
    identity_matrix = ska.make_identity_substitution_matrix(
        4, -2, alphabet=self.SignSequence.definite_chars
    )
    first_signs = self.SignSequence(input1.replace(' ', '*'))
    second_signs = self.SignSequence(input2.replace(' ', '*'))

    # Positional 4 and 1 are the gap-open and gap-extend penalties.
    alignment, _score, _start_end_positions = ska.global_pairwise_align(
        first_signs, second_signs, 4, 1, substitution_matrix=identity_matrix
    )
    return self.merge_sequences(str(alignment[0]), str(alignment[1]))
def build_compact_global_mask(refs):
    """Build one combined regex mask for ``refs`` and align its first two masks.

    The masks come from ``build_masks(refs)``. The first two entries are
    wrapped in ``CustomSequence`` and globally aligned (gap open 1, gap
    extend 0) with ``subst_matrix``; the alignment result is printed.

    Returns
    -------
    tuple
        ``(global_mask, alignment)`` where ``global_mask`` is the ``'|'``-joined
        masks wrapped in the fixed prefix/suffix below, and ``alignment`` is
        whatever ``global_pairwise_align`` returned.
    """
    masks = build_masks(refs)

    # Only the first two masks take part in the alignment, so at least two
    # entries must be present.
    leading_mask = CustomSequence(masks[0])
    following_mask = CustomSequence(masks[1])
    alignment = global_pairwise_align(
        leading_mask,
        following_mask,
        gap_open_penalty=1,
        gap_extend_penalty=0,
        substitution_matrix=subst_matrix,
    )
    print(alignment)

    # NOTE(review): the inline flag ``(?i)`` follows ``^``. Python's ``re``
    # rejects global flags that are not at the start of the pattern from
    # 3.11 on -- confirm which regex engine consumes this mask before
    # touching the string.
    pattern_pieces = [
        '^(?i)(',
        '|'.join(masks),
        ')|(_Hearing_IS|_Complaint_IS|_Settlement_IS|_Verdict_IS|_Withdrawal_IS)\\?$',
    ]
    global_mask = ''.join(pattern_pieces)
    return global_mask, alignment
def custom_align(target, cluster):
    """Encode ``target`` and ``cluster`` and align the encoded forms globally.

    ``generate_hash`` produces the mapping used by ``code_seq`` to encode both
    inputs. A throw-away ``GrammaredSequence`` subclass is declared whose
    definite characters are the values of that mapping, and the two encoded
    inputs are aligned with an identity substitution matrix (match 1,
    mismatch -1) and gap open/extend penalties of 1.

    NOTE(review): the substitution-matrix alphabet is the free name
    ``letters``, not the values of the mapping -- presumably a module-level
    alphabet; verify it covers every encoded character.

    Returns whatever ``global_pairwise_align`` returns.
    """
    hashes = generate_hash(target, cluster)
    encoded_target = code_seq(target, hashes)
    encoded_cluster = code_seq(cluster, hashes)
    print('Coded cluster: ', encoded_cluster)

    class CustomSequence(GrammaredSequence):
        # Alphabet derived from the current encoding; no degenerate symbols.

        @classproperty
        def degenerate_map(cls):
            return {}

        @classproperty
        def definite_chars(cls):
            return {hashes[key] for key in hashes}

        @classproperty
        def default_gap_char(cls):
            return '-'

        @classproperty
        def gap_chars(cls):
            return {'-', '.'}

    target_sequence = CustomSequence(encoded_target)
    cluster_sequence = CustomSequence(encoded_cluster)

    identity_matrix = make_identity_substitution_matrix(
        match_score=1,
        mismatch_score=-1,
        alphabet=letters,
    )

    return global_pairwise_align(
        target_sequence,
        cluster_sequence,
        gap_open_penalty=1,
        gap_extend_penalty=1,
        substitution_matrix=identity_matrix,
    )
def test_global_pairwise_align_custom_alphabet_nondegenerate_chars(self):
    """Global alignment works with a custom alphabet's nondegenerate chars.

    Expected values were computed by running an equivalent alignment using
    the DNA alphabet with the mapping W->A, X->C, Y->G, Z->T.
    """
    identity_matrix = make_identity_substitution_matrix(
        1, -1, alphabet=CustomSequence.nondegenerate_chars)

    result = global_pairwise_align(
        CustomSequence("WXYZ"),
        CustomSequence("WXYYZZ"),
        10.0,
        5.0,
        identity_matrix,
    )
    observed_msa, observed_score, observed_start_end = result

    expected_msa = TabularMSA([
        CustomSequence('WXYZ^^'),
        CustomSequence('WXYYZZ'),
    ])
    self.assertEqual(observed_msa, expected_msa)
    self.assertEqual(observed_score, 2.0)
    self.assertEqual(observed_start_end, [(0, 3), (0, 5)])
def test_global_pairwise_align_invalid_type(self):
    """A plain ``Sequence`` operand must be rejected with ``TypeError``.

    The error message is expected to mention ``GrammaredSequence``,
    ``TabularMSA`` and the offending type name ``'Sequence'``, in that order.
    """
    expected_message = "GrammaredSequence.*TabularMSA.*'Sequence'"
    with self.assertRaisesRegex(TypeError, expected_message):
        accepted_operand = DNA('ACGT')
        rejected_operand = Sequence('ACGT')
        global_pairwise_align(accepted_operand, rejected_operand,
                              1.0, 1.0, {})
def test_global_pairwise_align_invalid_type(self):
    """A plain ``Sequence`` operand must be rejected with ``TypeError``.

    Uses ``six.assertRaisesRegex``; the message is expected to mention
    ``IUPACSequence``, ``TabularMSA`` and ``'Sequence'``, in that order.
    """
    expects_type_error = six.assertRaisesRegex(
        self, TypeError, "IUPACSequence.*TabularMSA.*'Sequence'")
    with expects_type_error:
        accepted_operand = DNA('ACGT')
        rejected_operand = Sequence('ACGT')
        global_pairwise_align(accepted_operand, rejected_operand,
                              1.0, 1.0, {})
def msa_alignment_skbio(text_1, text_2, text_3):
    """Align three texts pairwise around the middle one using scikit-bio.

    ``text_1`` is globally aligned with ``text_2`` and ``text_2`` with
    ``text_3`` (gap open/extend penalty 1, identity substitution matrix with
    match 1 / mismatch -1 over the charset reported by the second sequence).
    Of the two aligned renderings of ``text_2``, the longer one is chosen as
    the pivot (the first one wins ties). The length and text of the aligned
    first text, the pivot and the aligned third text are printed.

    Returns
    -------
    tuple or None
        ``("test", pivot, aligned_text_3)``. The first element is a
        placeholder: the wildcard fill-up step
        (``MsaHandler.fillup_wildcarded_result``) is currently disabled, for
        the first and the third text alike. If any ``Exception`` is raised
        during alignment, the name of the function in the innermost traceback
        frame is printed and ``None`` is returned implicitly.
    """
    from skbio.alignment import global_pairwise_align, make_identity_substitution_matrix
    from multi_sequence_alignment.scikit_custom_sequence_ocr import CustomSequence

    gap_open_penalty = 1
    gap_extend_penalty = 1

    try:
        # TODO (from the original author): skbio's sequence/_sequence.py and
        # sequence/_grammared_sequence.py lack proper encoding support; the
        # current workaround is to replace non-ASCII characters with '?'.
        first_seq = CustomSequence(text_1)
        middle_seq = CustomSequence(text_2)
        last_seq = CustomSequence(text_3)

        identity_matrix = make_identity_substitution_matrix(
            1, -1, middle_seq.create_charset_string())

        msa_first_middle, _score_fm, _positions_fm = global_pairwise_align(
            first_seq, middle_seq, gap_open_penalty, gap_extend_penalty,
            identity_matrix)
        msa_middle_last, _score_ml, _positions_ml = global_pairwise_align(
            middle_seq, last_seq, gap_open_penalty, gap_extend_penalty,
            identity_matrix)

        aligned_first = str(msa_first_middle._seqs[0])
        middle_from_first_pair = str(msa_first_middle._seqs[1])
        middle_from_last_pair = str(msa_middle_last._seqs[0])
        aligned_last = str(msa_middle_last._seqs[1])

        # The longer rendering of the middle text is the pivot; ties go to
        # the rendering from the first pair.
        if len(middle_from_first_pair) >= len(middle_from_last_pair):
            pivot = middle_from_first_pair
        else:
            pivot = middle_from_last_pair

        for rendered in (aligned_first, pivot, aligned_last):
            print(len(rendered), rendered)

        # Fill-up against the pivot is disabled, so the first slot carries
        # the literal placeholder and the third text is returned unfilled.
        return "test", pivot, aligned_last
    except Exception:
        frames = inspect.trace()
        # Index 3 of a trace record is the function name.
        print("Exception raised in %s" % frames[-1][3])