Example #1
0
    def test_global_pairwise_align_dtype_mismatch(self):
        # Every call below pairs a DNA-typed input with an RNA-typed one and
        # must fail with a TypeError whose message matches this pattern.
        dtype_error = r"same dtype: 'DNA' != 'RNA'"

        # Bare sequence against a single-sequence MSA.
        with self.assertRaisesRegex(TypeError, dtype_error):
            global_pairwise_align(
                DNA('ACGT'), TabularMSA([RNA('ACGU')]), 1.0, 1.0, {})

        # MSA against MSA.
        with self.assertRaisesRegex(TypeError, dtype_error):
            global_pairwise_align(
                TabularMSA([DNA('ACGT')]), TabularMSA([RNA('ACGU')]),
                1.0, 1.0, {})
Example #2
0
    def test_global_pairwise_align_dtype_mismatch(self):
        # Pattern the TypeError message has to match in both scenarios.
        mismatch_pattern = "same dtype: 'DNA' != 'RNA'"

        # Scenario 1: DNA sequence aligned against an MSA holding RNA.
        with self.assertRaisesRegex(TypeError, mismatch_pattern):
            global_pairwise_align(
                DNA('ACGT'),
                TabularMSA([RNA('ACGU')]),
                1.0, 1.0, {})

        # Scenario 2: MSA holding DNA aligned against an MSA holding RNA.
        with self.assertRaisesRegex(TypeError, mismatch_pattern):
            global_pairwise_align(
                TabularMSA([DNA('ACGT')]),
                TabularMSA([RNA('ACGU')]),
                1.0, 1.0, {})
Example #3
0
def proteinAlign(seq1,
                 seq2,
                 gap_open_penalty,
                 gap_extend_penalty,
                 local=False):
    """Align two protein strings and summarise the result as a dict.

    Both inputs are upper-cased and wrapped in ``Protein`` before being
    aligned with the ``blosum50`` substitution matrix.

    Parameters
    ----------
    seq1, seq2
        Sequences to align; each must provide ``.upper()``.
    gap_open_penalty, gap_extend_penalty
        Gap penalties forwarded unchanged to the aligner.
    local
        When truthy, ``local_pairwise_align`` is used. Otherwise
        ``global_pairwise_align`` is used with
        ``penalize_terminal_gaps=True``.

    Returns
    -------
    dict
        ``'aln1'`` / ``'aln2'``: the two aligned rows as strings;
        ``'score'``: the score reported by the aligner;
        ``'similarity'``: relative match frequency between the two rows,
        multiplied by 100 and rounded to two decimals via string formatting.
    """
    upper_first, upper_second = seq1.upper(), seq2.upper()
    first_protein = Protein(upper_first)
    second_protein = Protein(upper_second)

    if local:
        msa, score, _ = local_pairwise_align(
            first_protein, second_protein,
            gap_open_penalty, gap_extend_penalty, blosum50)
    else:
        msa, score, _ = global_pairwise_align(
            first_protein, second_protein,
            gap_open_penalty, gap_extend_penalty, blosum50,
            penalize_terminal_gaps=True)

    top_row, bottom_row = msa[0], msa[1]
    percent_match = top_row.match_frequency(bottom_row, relative=True) * 100

    return {
        'aln1': str(top_row),
        'aln2': str(bottom_row),
        'score': score,
        # Round-trip through a 2-decimal string to get the rounded float.
        'similarity': float(format(percent_match, '.2f')),
    }
    def pairwise_align_and_merge_sequences(self, input1, input2):
        """Globally align two inputs and merge the aligned strings.

        * Both inputs non-empty: spaces are replaced by ``'*'``, each input
          is wrapped in ``self.SignSequence``, the pair is globally aligned
          (identity matrix with match 4 / mismatch -2, gap open 4, gap
          extend 1) and the two aligned strings are handed to
          ``self.merge_sequences``, whose result is returned.
        * Exactly one input non-empty: returns one single-item list per
          element of that input.
        * Both inputs empty: returns ``[[""]]``.
        """
        first_len, second_len = len(input1), len(input2)

        if first_len == 0 and second_len == 0:
            return [[""]]
        if second_len == 0:
            return [[symbol] for symbol in input1]
        if first_len == 0:
            return [[symbol] for symbol in input2]

        # Positional arguments are (match_score, mismatch_score).
        identity_matrix = ska.make_identity_substitution_matrix(
            4, -2, alphabet=self.SignSequence.definite_chars)
        first_seq = self.SignSequence(input1.replace(' ', '*'))
        second_seq = self.SignSequence(input2.replace(' ', '*'))
        aligned, _score, _positions = ska.global_pairwise_align(
            first_seq, second_seq, 4, 1, substitution_matrix=identity_matrix)
        return self.merge_sequences(str(aligned[0]), str(aligned[1]))
Example #5
0
def build_compact_global_mask(refs):
    """Build a combined regex mask for ``refs`` and align its first two masks.

    ``build_masks(refs)`` supplies the individual masks. The first two are
    wrapped in ``CustomSequence`` and globally aligned (gap open 1, gap
    extend 0, module-level ``subst_matrix``); the alignment result is printed.
    All masks are then joined with ``|`` into one pattern string.

    Returns
    -------
    tuple
        ``(global_mask, alignment)`` - the pattern string and the unchanged
        return value of ``global_pairwise_align``.
    """
    masks = build_masks(refs)

    first_mask = CustomSequence(masks[0])
    second_mask = CustomSequence(masks[1])
    alignment = global_pairwise_align(first_mask,
                                      second_mask,
                                      gap_open_penalty=1,
                                      gap_extend_penalty=0,
                                      substitution_matrix=subst_matrix)
    print(alignment)

    # NOTE(review): the inline flag `(?i)` sits after `^`. If this pattern is
    # compiled with Python's `re`, global flags are only accepted at the very
    # start of the expression on recent versions - confirm with the consumer.
    prefix = '^(?i)('
    suffix = ')|(_Hearing_IS|_Complaint_IS|_Settlement_IS|_Verdict_IS|_Withdrawal_IS)\\?$'
    global_mask = prefix + '|'.join(masks) + suffix
    return global_mask, alignment
Example #6
0
def custom_align(target, cluster):
    """Globally align ``target`` and ``cluster`` after re-encoding them.

    ``generate_hash(target, cluster)`` produces a mapping; both inputs are
    encoded with ``code_seq`` using that mapping, and the encoded cluster is
    printed. The encoded strings are wrapped in a locally defined
    ``GrammaredSequence`` subclass whose definite characters are the values
    of the mapping, then aligned with gap open/extend penalties of 1 and an
    identity substitution matrix (match 1, mismatch -1).

    NOTE(review): the substitution matrix alphabet is ``letters``, a name
    taken from outside this function rather than derived from the mapping -
    confirm it covers every encoded character.

    Returns
    -------
    The return value of ``global_pairwise_align``, unchanged.
    """
    hashes = generate_hash(target, cluster)
    encoded_target = code_seq(target, hashes)
    encoded_cluster = code_seq(cluster, hashes)

    print('Coded cluster: ', encoded_cluster)

    class CustomSequence(GrammaredSequence):
        # Grammar built on the fly: definite characters come from `hashes`.

        @classproperty
        def gap_chars(cls):
            return {'-', '.'}

        @classproperty
        def default_gap_char(cls):
            return '-'

        @classproperty
        def definite_chars(cls):
            return {hashes[key] for key in hashes}

        @classproperty
        def degenerate_map(cls):
            # No degenerate characters in this grammar.
            return {}

    target_seq = CustomSequence(encoded_target)
    cluster_seq = CustomSequence(encoded_cluster)

    identity_matrix = make_identity_substitution_matrix(
        match_score=1, mismatch_score=-1, alphabet=letters)

    return global_pairwise_align(
        target_seq,
        cluster_seq,
        gap_open_penalty=1,
        gap_extend_penalty=1,
        substitution_matrix=identity_matrix,
    )
Example #7
0
    def test_global_pairwise_align_custom_alphabet_nondegenerate_chars(self):
        # Identity scoring (match 1, mismatch -1) over the custom alphabet,
        # read through the `nondegenerate_chars` attribute.
        identity_matrix = make_identity_substitution_matrix(
            1, -1, alphabet=CustomSequence.nondegenerate_chars)

        observed_msa, observed_score, observed_positions = \
            global_pairwise_align(CustomSequence("WXYZ"),
                                  CustomSequence("WXYYZZ"),
                                  10.0, 5.0, identity_matrix)

        # Expected values computed by running an equivalent alignment using
        # the DNA alphabet with the following mapping:
        #
        #     W X Y Z
        #     | | | |
        #     A C G T
        #
        expected_msa = TabularMSA([CustomSequence('WXYZ^^'),
                                   CustomSequence('WXYYZZ')])
        self.assertEqual(observed_msa, expected_msa)
        self.assertEqual(observed_score, 2.0)
        self.assertEqual(observed_positions, [(0, 3), (0, 5)])
Example #8
0
 def test_global_pairwise_align_invalid_type(self):
     # A plain `Sequence` as the second argument must raise a TypeError whose
     # message matches this pattern.
     expected_message = "GrammaredSequence.*TabularMSA.*'Sequence'"
     with self.assertRaisesRegex(TypeError, expected_message):
         global_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
Example #9
0
 def test_global_pairwise_align_invalid_type(self):
     # A plain `Sequence` as the second argument must raise a TypeError whose
     # message matches this pattern; the check goes through the `six` helper.
     expected_message = "IUPACSequence.*TabularMSA.*'Sequence'"
     with six.assertRaisesRegex(self, TypeError, expected_message):
         global_pairwise_align(DNA('ACGT'), Sequence('ACGT'), 1.0, 1.0, {})
def msa_alignment_skbio(text_1, text_2, text_3):
    """Pairwise-align three texts, using ``text_2`` as the shared pivot.

    ``text_1`` is globally aligned with ``text_2`` and ``text_2`` with
    ``text_3``. Both alignments use gap open/extend penalties of 1 and an
    identity substitution matrix (match 1, mismatch -1) built from
    ``cs2.create_charset_string()``, i.e. from the second text only.

    The longer of the two aligned versions of ``text_2`` is chosen as the
    pivot (ties go to the one from the first alignment). The three aligned
    strings are printed together with their lengths.

    Returns
    -------
    tuple
        ``(res_final_1, res_final_2, res_final_3)`` where, as the code stands,
        ``res_final_1`` is the placeholder string ``"test"`` (the fill-up call
        is commented out), ``res_final_2`` is the pivot string and
        ``res_final_3`` is the aligned ``text_3`` from the second alignment.

    Notes
    -----
    Any exception raised inside the ``try`` block is caught and only reported
    via ``print``; no ``return`` is visible on that path in this view.
    ``inspect`` is not imported inside this function, so it is presumably
    imported at module level - TODO confirm.
    """
    from skbio.alignment import global_pairwise_align, make_identity_substitution_matrix

    from multi_sequence_alignment.scikit_custom_sequence_ocr import CustomSequence

    # Gap penalties shared by both pairwise alignments below.
    gap_open_penalty = 1
    gap_extend_penalty = 1
    # substitution_matrix_b50 =CustomSequence.blosum50 # just an example

    try:
        # The bare string below is a no-op expression statement holding a
        # disabled example call; it is never executed as code.
        """    
        alignment, score, start_end_positions = local_pairwise_align_ssw(
            DNA("ACTAAGGCTCTCTACCCCTCTCAGAGA"),
            DNA("ACTAAGGCTCCTAACCCCCTTTTCTCAGA")
        )
        """
        # todo sequence/_sequence.py is missing proper encoding, this has to fix to make this work completely atm workaround: replace non ascii with '?'
        # also sequence/_grammared_sequence.py
        # cs1 = CustomSequence("Hallo das ist ein Test überkrass")
        # cs2 = CustomSequence("H4llo das ist Test überkraass")
        cs1 = CustomSequence(text_1)
        cs2 = CustomSequence(text_2)
        cs3 = CustomSequence(text_3)

        # substitution_matrix_unity = cs2.create_unity_sequence_matrix()
        # Identity matrix (match 1, mismatch -1) over the charset string
        # reported by cs2 only.
        # NOTE(review): presumably that charset must also cover the characters
        # of text_1 and text_3 - verify against CustomSequence.
        substitution_matrix_equal = make_identity_substitution_matrix(
            1, -1, cs2.create_charset_string())

        # alignment, score, start_end_positions = local_pairwise_align(cs1, cs2, gap_open_penalty, gap_extend_penalty, substitution_matrix_unity)

        # Align text 1 with text 2, then text 2 with text 3, using the same
        # penalties and matrix.
        alignment12, score12, start_end_positions12 = global_pairwise_align(
            cs1, cs2, gap_open_penalty, gap_extend_penalty,
            substitution_matrix_equal)
        alignment23, score23, start_end_positions23 = global_pairwise_align(
            cs2, cs3, gap_open_penalty, gap_extend_penalty,
            substitution_matrix_equal)

        # alignment3, score3, start_end_positions3 = global_pairwise_align("Hallo das ist ein Test", "H4llo das ist Test", gap_open_penalty, gap_extend_penalty, substitution_matrix_equal)
        # res_one_1, res_two_1 = MsaHandler.compare(list_one, list_two)

        # res_two_2, res_three_2 = MsaHandler.compare(list_two, list_three)
        # Aligned rows are read through the private `_seqs` attribute of each
        # alignment result and converted to plain strings.
        res_one_1 = str(alignment12._seqs[0])
        res_two_1 = str(alignment12._seqs[1])
        res_two_2 = str(alignment23._seqs[0])
        res_three_2 = str(alignment23._seqs[1])

        list_res_one_1 = list(res_one_1)
        list_res_two_1 = list(res_two_1)

        list_res_two_2 = list(res_two_2)
        list_res_three_2 = list(res_three_2)

        # Pick the longer aligned version of text 2 as the pivot; on equal
        # length the one from the first alignment wins.
        list_pivot_msa = None
        pivot_msa = None
        if len(list_res_two_1) >= len(list_res_two_2):
            # if len(list_res_two_1) > len(list_res_two_2):
            list_pivot_msa = list_res_two_1
            pivot_msa = res_two_1
        else:
            list_pivot_msa = list_res_two_2
            pivot_msa = res_two_2

        # Debug output: length and content of each aligned string.
        print(len(res_one_1), res_one_1)
        print(len(pivot_msa), pivot_msa)
        print(len(res_three_2), res_three_2)
        # if res_one_1.__contains__("Sitz:") is True:
        #    print("asd")

        # Placeholders: the fill-up calls are commented out, so both values
        # are the literal string "test".
        res_one_1_filled = "test"  #MsaHandler.fillup_wildcarded_result(res_one_1, pivot_msa, '@')
        res_three_2_filled = "test"  # MsaHandler.fillup_wildcarded_result(res_three_2, pivot_msa, '@')

        res_final_1 = res_one_1_filled
        res_final_2 = pivot_msa
        # The third result is the raw aligned string, not the "filled" one.
        res_final_3 = res_three_2
        #res_final_3 = res_three_2_filled
        return res_final_1, res_final_2, res_final_3

    except Exception as ex:
        # Report the function name recorded in the innermost traceback frame;
        # the exception object itself is not printed or re-raised here.
        tr = inspect.trace()
        print("Exception raised in %s" % tr[-1][3])