Exemple #1
0
    def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
        """Check that penalize_terminal_gaps changes alignment and score.

        One input is much longer than the other. The same pair is aligned
        twice, once with the flag off and once with it on, and the returned
        TabularMSA and score are compared with fixed expectations.
        """
        query = DNA("ACCGTGGACCGTTAGGATTGGACCCAAGGTTG")
        target = DNA("T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25)
        scoring = dict(gap_open_penalty=5., gap_extend_penalty=0.5,
                       match_score=5, mismatch_score=-4)

        # the expected row of the longer sequence is the same in both runs
        target_row = ("TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
                      "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")

        # (penalize_terminal_gaps, expected row of the short sequence, score)
        cases = [
            (False,
             "-------------------------ACCGTGGACCGTTAGGA"
             "TTGGACCCAAGGTTG-------------------------",
             131.0),
            (True,
             "-------------------------ACCGTGGACCGTTAGGA"
             "TTGGACCCAAGGTT-------------------------G",
             97.0),
        ]

        for penalize, query_row, expected_score in cases:
            msa, score, _ = global_pairwise_align_nucleotide(
                query, target, penalize_terminal_gaps=penalize, **scoring)
            self.assertEqual(
                msa, TabularMSA([DNA(query_row), DNA(target_row)]))
            self.assertEqual(score, expected_score)
    def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
        """Check that penalize_terminal_gaps changes alignment and score.

        One input is much longer than the other. The pair is aligned with
        the flag off and then on; both aligned rows and the score are
        compared with fixed expectations. Warnings emitted during the
        aligner calls are silenced.
        """
        short_seq = "ACCGTGGACCGTTAGGATTGGACCCAAGGTTG"
        long_seq = "T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25
        scoring = {'gap_open_penalty': 5., 'gap_extend_penalty': 0.5,
                   'match_score': 5, 'mismatch_score': -4}

        # --- terminal gaps not penalized ---
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            unpenalized = global_pairwise_align_nucleotide(
                short_seq, long_seq, penalize_terminal_gaps=False, **scoring)
        self.assertEqual(
            str(unpenalized[0]),
            "-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTTG-------------------------")
        self.assertEqual(
            str(unpenalized[1]),
            "TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")
        self.assertEqual(unpenalized.score(), 131.0)

        # --- terminal gaps penalized ---
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            penalized = global_pairwise_align_nucleotide(
                short_seq, long_seq, penalize_terminal_gaps=True, **scoring)
        self.assertEqual(
            str(penalized[0]),
            "-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTT-------------------------G")
        self.assertEqual(
            str(penalized[1]),
            "TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA")
        self.assertEqual(penalized.score(), 97.0)
Exemple #3
0
    def test_global_pairwise_align_nucleotide_penalize_terminal_gaps(self):
        """Check that penalize_terminal_gaps changes alignment and score.

        One input is much longer than the other. The pair is aligned with
        the flag off and then on, and the two aligned rows plus the score
        are compared with fixed expectations each time.
        """
        seq1 = "ACCGTGGACCGTTAGGATTGGACCCAAGGTTG"
        seq2 = "T"*25 + "ACCGTGGACCGTAGGATTGGACCAAGGTTA" + "A"*25

        def align_and_check(penalize, row1, row2, score):
            # run one alignment and compare rows and score, in that order
            result = global_pairwise_align_nucleotide(
                seq1, seq2, gap_open_penalty=5., gap_extend_penalty=0.5,
                match_score=5, mismatch_score=-4,
                penalize_terminal_gaps=penalize)
            self.assertEqual(str(result[0]), row1)
            self.assertEqual(str(result[1]), row2)
            self.assertEqual(result.score(), score)

        align_and_check(
            False,
            "-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTTG-------------------------",
            "TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA",
            131.0)

        align_and_check(
            True,
            "-------------------------ACCGTGGACCGTTAGGA"
            "TTGGACCCAAGGTT-------------------------G",
            "TTTTTTTTTTTTTTTTTTTTTTTTTACCGTGGACCGT-AGGA"
            "TTGGACC-AAGGTTAAAAAAAAAAAAAAAAAAAAAAAAAA",
            97.0)
Exemple #4
0
    def test_global_pairwise_align_nucleotide(self):
        """Exercise global_pairwise_align_nucleotide on a fixed sequence pair.

        Covers two gap-open penalties, inputs carrying metadata, a
        TabularMSA aligned against a single sequence, and the TypeError
        raised for non-sequence input.
        """
        first = "GACCTTGACCAGGTACC"
        second = "GAACTTTGACGTAAC"
        # every successful call is expected to report these positions
        expected_positions = [(0, 16), (0, 14)]
        common = dict(gap_extend_penalty=0.5, match_score=5,
                      mismatch_score=-4)

        def tagged(sequence, name):
            # DNA carrying an 'id' metadata entry
            return DNA(sequence, metadata={'id': name})

        # gap_open_penalty of 5
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA(first), DNA(second), gap_open_penalty=5., **common)
        self.assertEqual(msa, TabularMSA([DNA("G-ACCTTGACCAGGTACC"),
                                          DNA("GAACTTTGAC---GTAAC")]))
        self.assertEqual(score, 41.0)
        self.assertEqual(start_end, expected_positions)

        # gap_open_penalty of 10
        msa, score, start_end = global_pairwise_align_nucleotide(
            DNA(first), DNA(second), gap_open_penalty=10., **common)
        self.assertEqual(msa, TabularMSA([DNA("-GACCTTGACCAGGTACC"),
                                          DNA("GAACTTTGAC---GTAAC")]))
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, expected_positions)

        # DNA sequences with metadata
        msa, score, start_end = global_pairwise_align_nucleotide(
            tagged(first, "s1"), tagged(second, "s2"),
            gap_open_penalty=10., **common)
        self.assertEqual(
            msa,
            TabularMSA([tagged("-GACCTTGACCAGGTACC", "s1"),
                        tagged("GAACTTTGAC---GTAAC", "s2")]))
        self.assertEqual(score, 32.0)
        self.assertEqual(start_end, expected_positions)

        # Align one DNA sequence and one TabularMSA, score computed manually
        msa, score, start_end = global_pairwise_align_nucleotide(
            TabularMSA([tagged(first, "s1"),
                        tagged("GACCATGACCAGGTACC", "s2")]),
            tagged(second, "s3"),
            gap_open_penalty=10., **common)
        self.assertEqual(
            msa,
            TabularMSA([tagged("-GACCTTGACCAGGTACC", "s1"),
                        tagged("-GACCATGACCAGGTACC", "s2"),
                        tagged("GAACTTTGAC---GTAAC", "s3")]))
        self.assertEqual(score, 27.5)
        self.assertEqual(start_end, expected_positions)

        # TypeError on invalid input, in either argument position
        valid = DNA("ACGT")
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide(42, valid)
        valid = DNA("ACGT")
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide(valid, 42)
Exemple #5
0
    def test_nucleotide_aligners_use_substitution_matrices(self):
        """Check that both nucleotide aligners honor substitution_matrix.

        For the local and then the global aligner, the same sequence pair
        is aligned once with match/mismatch scoring only and once with an
        alternate identity substitution matrix; the two results must differ.
        """
        alt_sub = make_identity_substitution_matrix(10, -10)
        seqs = ("GACCTTGACCAGGTACC", "GAACTTTGACGTAAC")
        scoring = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                       match_score=5, mismatch_score=-4)

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with local alignment
        actual_no_sub = local_pairwise_align_nucleotide(*seqs, **scoring)
        actual_alt_sub = local_pairwise_align_nucleotide(
            *seqs, substitution_matrix=alt_sub, **scoring)
        self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
        self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
        self.assertNotEqual(actual_no_sub.score(),
                            actual_alt_sub.score())

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with global
        # alignment. Both runs must use the global aligner: comparing a
        # local baseline against a global result would differ regardless of
        # whether the matrix is honored.
        actual_no_sub = global_pairwise_align_nucleotide(*seqs, **scoring)
        actual_alt_sub = global_pairwise_align_nucleotide(
            *seqs, substitution_matrix=alt_sub, **scoring)
        # a different matrix may move gaps in only one of the two rows, so
        # the rows are compared as a pair rather than one by one
        self.assertNotEqual(
            (str(actual_no_sub[0]), str(actual_no_sub[1])),
            (str(actual_alt_sub[0]), str(actual_alt_sub[1])))
        self.assertNotEqual(actual_no_sub.score(),
                            actual_alt_sub.score())
    def test_nucleotide_aligners_use_substitution_matrices(self):
        """Check that both nucleotide aligners honor substitution_matrix.

        For the local and then the global aligner, the same sequence pair
        is aligned once with match/mismatch scoring only and once with an
        alternate identity substitution matrix; the two results must differ.
        Warnings emitted during the aligner calls are silenced.
        """
        alt_sub = make_identity_substitution_matrix(10, -10)
        seqs = ("GACCTTGACCAGGTACC", "GAACTTTGACGTAAC")
        scoring = dict(gap_open_penalty=10., gap_extend_penalty=5.,
                       match_score=5, mismatch_score=-4)

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with local alignment
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            actual_no_sub = local_pairwise_align_nucleotide(*seqs, **scoring)
            actual_alt_sub = local_pairwise_align_nucleotide(
                *seqs, substitution_matrix=alt_sub, **scoring)
        self.assertNotEqual(str(actual_no_sub[0]), str(actual_alt_sub[0]))
        self.assertNotEqual(str(actual_no_sub[1]), str(actual_alt_sub[1]))
        self.assertNotEqual(actual_no_sub.score(),
                            actual_alt_sub.score())

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with global
        # alignment. Both runs must use the global aligner: comparing a
        # local baseline against a global result would differ regardless of
        # whether the matrix is honored.
        with warnings.catch_warnings():
            warnings.simplefilter("ignore")
            actual_no_sub = global_pairwise_align_nucleotide(*seqs, **scoring)
            actual_alt_sub = global_pairwise_align_nucleotide(
                *seqs, substitution_matrix=alt_sub, **scoring)
        # a different matrix may move gaps in only one of the two rows, so
        # the rows are compared as a pair rather than one by one
        self.assertNotEqual(
            (str(actual_no_sub[0]), str(actual_no_sub[1])),
            (str(actual_alt_sub[0]), str(actual_alt_sub[1])))
        self.assertNotEqual(actual_no_sub.score(),
                            actual_alt_sub.score())
Exemple #7
0
def global_alignments(ref, q):
    """Globally align two nucleotide strings and convert the result.

    Both inputs are wrapped in ``DNA`` and aligned with
    ``global_pairwise_align_nucleotide`` using ``match_score=4`` and
    ``mismatch_score=1``. The two aligned rows are passed, as text, to
    ``alignments_to_cigar`` and its return value is returned unchanged.

    Parameters
    ----------
    ref : str
        Reference sequence.
    q : str
        Query sequence.
    """
    # only the alignment itself is needed; score and positions are ignored
    alignment, _score, _start_end_positions = \
        global_pairwise_align_nucleotide(
            DNA(ref), DNA(q), match_score=4, mismatch_score=1)
    # use the public str() conversion instead of the private ``_string``
    # byte buffer
    return alignments_to_cigar(str(alignment[0]), str(alignment[1]))
Exemple #8
0
    def test_nucleotide_aligners_use_substitution_matrices(self):
        """Check that both nucleotide aligners honor substitution_matrix.

        Each aligner is run on the same pair with and without an alternate
        identity substitution matrix, and the returned MSA, score and
        start/end positions are compared between the two runs.
        """
        alt_sub = make_identity_substitution_matrix(10, -10)

        def run(aligner, **extra):
            # one alignment of the fixed pair with the fixed gap/score setup
            return aligner(
                DNA("GACCTTGACCAGGTACC"), DNA("GAACTTTGACGTAAC"),
                gap_open_penalty=10., gap_extend_penalty=5., match_score=5,
                mismatch_score=-4, **extra)

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with local alignment
        plain_msa, plain_score, plain_pos = run(
            local_pairwise_align_nucleotide)
        alt_msa, alt_score, alt_pos = run(
            local_pairwise_align_nucleotide, substitution_matrix=alt_sub)

        self.assertNotEqual(plain_msa, alt_msa)
        self.assertNotEqual(plain_score, alt_score)
        self.assertNotEqual(plain_pos, alt_pos)

        # alternate substitution matrix yields different alignment (the
        # aligned sequences and the scores are different) with global alignment
        plain_msa, plain_score, plain_pos = run(
            global_pairwise_align_nucleotide)
        alt_msa, alt_score, alt_pos = run(
            global_pairwise_align_nucleotide, substitution_matrix=alt_sub)

        self.assertNotEqual(plain_msa, alt_msa)
        self.assertNotEqual(plain_score, alt_score)
        # the start/end positions are expected to match in the global case
        self.assertEqual(plain_pos, alt_pos)
Exemple #9
0
def global_align(seq1_1hot, seq2_1hot):
    """Align two 1-hot encoded sequences.

    Each input is converted with ``dna_io.hot1_dna`` and wrapped in ``DNA``;
    the pair is then globally aligned with fixed scoring options.

    Parameters
    ----------
    seq1_1hot, seq2_1hot
        1-hot encoded sequences accepted by ``dna_io.hot1_dna``.

    Returns
    -------
    tuple of (str, str)
        The first and second aligned rows, as text.
    """
    # single source of truth for the scoring parameters; expanded as keyword
    # arguments below
    align_opts = {
        'gap_open_penalty': 10,
        'gap_extend_penalty': 1,
        'match_score': 5,
        'mismatch_score': -4
    }

    seq1_dna = DNA(dna_io.hot1_dna(seq1_1hot))
    seq2_dna = DNA(dna_io.hot1_dna(seq2_1hot))
    # element 0 of the aligner's result is the alignment itself
    seq_align = global_pairwise_align_nucleotide(seq1_dna, seq2_dna,
                                                 **align_opts)[0]
    return str(seq_align[0]), str(seq_align[1])
Exemple #10
0
def dnaAlign(seq1, seq2, gap_open_penalty, gap_extend_penalty, local=False):
    """Align two DNA strings and summarise the result as a dict.

    Both inputs are upper-cased and wrapped in ``DNA`` before alignment.

    Parameters
    ----------
    seq1, seq2 : str
        Sequences to align.
    gap_open_penalty, gap_extend_penalty
        Passed positionally to the aligner after the two sequences.
    local : bool, optional
        Use ``local_pairwise_align_nucleotide`` when true, otherwise
        ``global_pairwise_align_nucleotide``.

    Returns
    -------
    dict
        ``aln1`` and ``aln2`` (aligned rows as text), ``score`` (as returned
        by the aligner) and ``similarity`` (relative match frequency of the
        two rows times 100, rounded to two decimals).
    """
    upper1 = seq1.upper()
    upper2 = seq2.upper()

    if local:
        aligner = local_pairwise_align_nucleotide
    else:
        aligner = global_pairwise_align_nucleotide
    msa, score, _ = aligner(DNA(upper1), DNA(upper2), gap_open_penalty,
                            gap_extend_penalty)

    row1, row2 = msa[0], msa[1]
    percent = row1.match_frequency(row2, relative=True) * 100

    return {
        'aln1': str(row1),
        'aln2': str(row2),
        'score': score,
        # round-trip through text to keep exactly two decimals
        'similarity': float('{:.2f}'.format(percent)),
    }
Exemple #11
0
 def test_global_pairwise_align_nucleotide_invalid_dtype(self):
     """A TabularMSA of Protein must be rejected with a TypeError."""
     pattern = "TabularMSA with DNA or RNA dtype.*dtype 'Protein'"
     with self.assertRaisesRegex(TypeError, pattern):
         nucleotide_msa = TabularMSA([DNA('ACGT')])
         protein_msa = TabularMSA([Protein('PAW')])
         global_pairwise_align_nucleotide(nucleotide_msa, protein_msa)
    def test_global_pairwise_align_nucleotide(self):
        """Exercise global_pairwise_align_nucleotide on a fixed sequence pair.

        Covers str and DNA inputs, two gap-open penalties, an Alignment
        aligned against a single sequence, default ids, and the TypeError
        raised for non-sequence input.
        """
        first = "GACCTTGACCAGGTACC"
        second = "GAACTTTGACGTAAC"
        common = dict(gap_extend_penalty=0.5, match_score=5,
                      mismatch_score=-4)

        def check(aln, rows, score, ids):
            # rows first, then score, positions and ids
            for index, row in enumerate(rows):
                self.assertEqual(str(aln[index]), row)
            self.assertEqual(aln.score(), score)
            self.assertEqual(aln.start_end_positions(), [(0, 16), (0, 14)])
            self.assertEqual(aln.ids(), ids)

        # str input, gap_open_penalty of 5
        check(global_pairwise_align_nucleotide(first, second,
                                               gap_open_penalty=5.,
                                               **common),
              ("G-ACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"), 41.0,
              ['0', '1'])

        # str input, gap_open_penalty of 10
        check(global_pairwise_align_nucleotide(first, second,
                                               gap_open_penalty=10.,
                                               **common),
              ("-GACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"), 32.0,
              ['0', '1'])

        # DNA (rather than str) as input
        check(global_pairwise_align_nucleotide(DNA(first, "s1"),
                                               DNA(second, "s2"),
                                               gap_open_penalty=10.,
                                               **common),
              ("-GACCTTGACCAGGTACC", "GAACTTTGAC---GTAAC"), 32.0,
              ["s1", "s2"])

        # Align one DNA sequence and one Alignment, score computed manually
        pair = Alignment([DNA(first, "s1"),
                          DNA("GACCATGACCAGGTACC", "s2")])
        check(global_pairwise_align_nucleotide(pair, DNA(second, "s3"),
                                               gap_open_penalty=10.,
                                               **common),
              ("-GACCTTGACCAGGTACC", "-GACCATGACCAGGTACC",
               "GAACTTTGAC---GTAAC"), 27.5,
              ["s1", "s2", "s3"])

        # ids are provided if they're not passed in
        unnamed = global_pairwise_align_nucleotide(DNA(first), DNA(second),
                                                   gap_open_penalty=10.,
                                                   **common)
        self.assertEqual(unnamed.ids(), ['0', '1'])

        # TypeError on invalid input, in either argument position
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide(42, "HEAGAWGHEE")
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide("HEAGAWGHEE", 42)
Exemple #13
0
    def test_global_pairwise_align_nucleotide(self):
        """Exercise global_pairwise_align_nucleotide on a fixed sequence pair.

        Covers str and DNA inputs, two gap-open penalties, an Alignment
        aligned against a single sequence, default ids, and the TypeError
        raised for non-sequence input.
        """
        seq_a = "GACCTTGACCAGGTACC"
        seq_b = "GAACTTTGACGTAAC"
        positions = [(0, 16), (0, 14)]
        default_ids = ['0', '1']
        scores = {'gap_extend_penalty': 0.5, 'match_score': 5,
                  'mismatch_score': -4}

        def named(sequence, name):
            # DNA carrying an 'id' metadata entry
            return DNA(sequence, metadata={'id': name})

        # str input, gap_open_penalty of 5
        aln = global_pairwise_align_nucleotide(
            seq_a, seq_b, gap_open_penalty=5., **scores)
        self.assertEqual(str(aln[0]), "G-ACCTTGACCAGGTACC")
        self.assertEqual(str(aln[1]), "GAACTTTGAC---GTAAC")
        self.assertEqual(aln.score(), 41.0)
        self.assertEqual(aln.start_end_positions(), positions)
        self.assertEqual(aln.ids(), default_ids)

        # str input, gap_open_penalty of 10
        aln = global_pairwise_align_nucleotide(
            seq_a, seq_b, gap_open_penalty=10., **scores)
        self.assertEqual(str(aln[0]), "-GACCTTGACCAGGTACC")
        self.assertEqual(str(aln[1]), "GAACTTTGAC---GTAAC")
        self.assertEqual(aln.score(), 32.0)
        self.assertEqual(aln.start_end_positions(), positions)
        self.assertEqual(aln.ids(), default_ids)

        # DNA (rather than str) as input
        aln = global_pairwise_align_nucleotide(
            named(seq_a, "s1"), named(seq_b, "s2"),
            gap_open_penalty=10., **scores)
        self.assertEqual(str(aln[0]), "-GACCTTGACCAGGTACC")
        self.assertEqual(str(aln[1]), "GAACTTTGAC---GTAAC")
        self.assertEqual(aln.score(), 32.0)
        self.assertEqual(aln.start_end_positions(), positions)
        self.assertEqual(aln.ids(), ["s1", "s2"])

        # Align one DNA sequence and one Alignment, score computed manually
        aln = global_pairwise_align_nucleotide(
            Alignment([named(seq_a, "s1"),
                       named("GACCATGACCAGGTACC", "s2")]),
            named(seq_b, "s3"),
            gap_open_penalty=10., **scores)
        self.assertEqual(str(aln[0]), "-GACCTTGACCAGGTACC")
        self.assertEqual(str(aln[1]), "-GACCATGACCAGGTACC")
        self.assertEqual(str(aln[2]), "GAACTTTGAC---GTAAC")
        self.assertEqual(aln.score(), 27.5)
        self.assertEqual(aln.start_end_positions(), positions)
        self.assertEqual(aln.ids(), ["s1", "s2", "s3"])

        # ids are provided if they're not passed in
        aln = global_pairwise_align_nucleotide(
            DNA(seq_a), DNA(seq_b), gap_open_penalty=10., **scores)
        self.assertEqual(aln.ids(), default_ids)

        # TypeError on invalid input, in either argument position
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide(42, "HEAGAWGHEE")
        with self.assertRaises(TypeError):
            global_pairwise_align_nucleotide("HEAGAWGHEE", 42)
def pairwise_similarity(seq, query):
    """Globally align ``seq`` and ``query`` and compare the aligned rows.

    Returns the value of ``fraction_same`` computed between the first and
    second rows of the alignment produced with the aligner's defaults.
    """
    aligned = global_pairwise_align_nucleotide(seq, query)
    first_row = aligned[0]
    second_row = aligned[1]
    return first_row.fraction_same(second_row)
Exemple #15
0
def get_meth_profile(args, seg_chrom, seg_start, seg_end, seg_name,
                     seg_strand):
    """Compute a smoothed methylation profile for one genomic segment.

    Steps, in order:

    1. Fetch the segment's records from the tabix-indexed methylation file
       (``args.meth``) into a temporary TSV and load it with pandas.
    2. Keep only records whose read name is among the reads selected from
       ``args.bam`` (``exclude_ambiguous_reads`` when ``args.excl_ambig`` is
       set, otherwise ``get_reads``).
    3. Collect per-CG log-likelihood ratios per read, rank the CG locations
       densely, window them with ``slide_window`` and smooth with
       ``smooth``.
    4. Align the reference element sequence (``args.teref``) against the
       segment sequence fetched from ``args.ref`` and translate segment
       offsets into reference-element coordinates.

    Parameters
    ----------
    args
        Namespace-like object; the attributes read here are ``teref``,
        ``ref``, ``meth``, ``bam``, ``excl_ambig``, ``slidingwindowsize``,
        ``slidingwindowstep``, ``smoothwindowsize``, ``globalign`` and
        ``lenfrac``.
    seg_chrom, seg_start, seg_end
        Segment location used for the tabix and FASTA fetches.
    seg_name
        Segment name; only used in the log message.
    seg_strand
        ``'+'`` or ``'-'``; for ``'-'`` the segment sequence is passed
        through ``rc`` and positions are mirrored within the segment.

    Returns
    -------
    tuple of (list, list)
        Reference-element positions and the matching smoothed methylation
        values. Both lists are empty when too few windowed sites remain,
        when the alignment raises ``IndexError``, or when either aligned
        span is shorter than ``args.lenfrac`` of its sequence.
    """
    logger.info('profiling %s %s:%d-%d:%s' %
                (seg_name, seg_chrom, seg_start, seg_end, seg_strand))

    te_ref_seq = single_seq_fa(args.teref)
    ref = pysam.Fastafile(args.ref)

    meth_tbx = pysam.Tabixfile(args.meth)

    tmp_methdata = str(uuid4()) + '.tmp.methdata.tsv'

    try:
        with open(tmp_methdata, 'w') as meth_out:
            # header: copy only the first line of the methylation file
            with gzip.open(args.meth, 'rt') as meth_in:
                for line in meth_in:
                    assert line.startswith('chromosome')
                    meth_out.write(line)
                    break

            assert seg_chrom in meth_tbx.contigs

            for rec in meth_tbx.fetch(seg_chrom, seg_start, seg_end):
                meth_out.write(str(rec) + '\n')

        # index by read_name
        methdata = pd.read_csv(tmp_methdata, sep='\t', header=0, index_col=4)
    finally:
        # the scratch file must not outlive this call, even when fetching
        # or parsing fails
        if os.path.exists(tmp_methdata):
            os.remove(tmp_methdata)

    if args.excl_ambig:
        reads = exclude_ambiguous_reads(args.bam, seg_chrom, seg_start,
                                        seg_end)
    else:
        reads = get_reads(args.bam, seg_chrom, seg_start, seg_end)

    # keep only reads that also have methylation records
    reads = list(set(reads).intersection(set(methdata.index)))

    methdata = methdata.loc[reads]

    seg_reads = {}

    for index, row in methdata.iterrows():
        r_start = row['start']
        llr = row['log_lik_ratio']
        seq = row['sequence']

        # get per-CG position (nanopolish/calculate_methylation_frequency.py)
        cg_pos = seq.find("CG")
        first_cg_pos = cg_pos

        while cg_pos != -1:
            cg_start = r_start + cg_pos - first_cg_pos
            cg_pos = seq.find("CG", cg_pos + 1)

            # offset of this CG relative to the segment start
            cg_seg_start = cg_start - seg_start

            if seg_start <= cg_start <= seg_end:
                if index not in seg_reads:
                    seg_reads[index] = Read(index, cg_seg_start, llr)
                else:
                    seg_reads[index].add_cpg(cg_seg_start, llr)

    meth_table = dd(dict)
    sample = '.'.join(args.bam.split('.')[:-1])

    # one table row per (read, CG location), keyed by a throwaway uuid
    for name, read in seg_reads.items():
        for loc in read.llrs.keys():
            uuid = str(uuid4())
            meth_table[uuid]['loc'] = loc
            meth_table[uuid]['llr'] = read.llrs[loc]
            meth_table[uuid]['read'] = name
            meth_table[uuid]['sample'] = sample
            meth_table[uuid]['call'] = read.meth_calls[loc]

    meth_table = pd.DataFrame.from_dict(meth_table).T
    meth_table['loc'] = pd.to_numeric(meth_table['loc'])
    meth_table['llr'] = pd.to_numeric(meth_table['llr'])

    # replace each location by its dense rank, keeping the original offset
    meth_table['orig_loc'] = meth_table['loc']
    meth_table['loc'] = ss.rankdata(meth_table['loc'], method='dense')

    # map from dense rank back to the original segment offset
    cpg_to_coord = {}
    for orig_loc, new_loc in zip(meth_table['orig_loc'], meth_table['loc']):
        cpg_to_coord[new_loc] = orig_loc

    windowed_methfrac, meth_n = slide_window(meth_table,
                                             sample,
                                             width=int(args.slidingwindowsize),
                                             slide=int(args.slidingwindowstep))

    if len(windowed_methfrac) <= int(args.smoothwindowsize):
        logger.warning('too few sites after windowing: %s:%d-%d' %
                       (seg_chrom, seg_start, seg_end))
        return [], []

    smoothed_methfrac = smooth(np.asarray(list(windowed_methfrac.values())),
                               window_len=int(args.smoothwindowsize))

    coord_meth_pos = []

    cpg_meth_pos = list(windowed_methfrac.keys())

    for cpg in cpg_meth_pos:
        if seg_strand == '+':
            coord_meth_pos.append(cpg_to_coord[cpg])
        elif seg_strand == '-':
            # mirror the offset within the segment for the reverse strand
            coord_meth_pos.append((seg_end - seg_start) - cpg_to_coord[cpg])

    # alignment to ref elt

    elt_seq = ref.fetch(seg_chrom, seg_start, seg_end)
    if seg_strand == '-':
        elt_seq = rc(elt_seq)

    te_ref_seq = te_ref_seq.upper()
    elt_seq = elt_seq.upper()

    s_ref = skseq.DNA(te_ref_seq)
    s_elt = skseq.DNA(elt_seq)

    try:
        if args.globalign:
            aln_res = skalign.global_pairwise_align_nucleotide(s_ref, s_elt)
        else:
            aln_res = skalign.local_pairwise_align_ssw(s_ref, s_elt)
    except IndexError:  # scikit-bio throws this if no bases align  >:|
        logger.warning('no align on seg: %s:%d-%d' %
                       (seg_chrom, seg_start, seg_end))
        return [], []

    # element 2 of the result holds the (start, end) pair of each sequence
    coord_ref, coord_elt = aln_res[2]

    len_ref = coord_ref[1] - coord_ref[0]
    len_elt = coord_elt[1] - coord_elt[0]

    if len_ref / len(te_ref_seq) < float(args.lenfrac):
        logger.warning(
            'ref align too short on seg: %s:%d-%d (%f)' %
            (seg_chrom, seg_start, seg_end, len_ref / len(te_ref_seq)))
        return [], []

    if len_elt / len(elt_seq) < float(args.lenfrac):
        logger.warning('elt align too short on seg: %s:%d-%d (%f)' %
                       (seg_chrom, seg_start, seg_end, len_elt / len(elt_seq)))
        return [], []

    tab_msa = aln_res[0]

    elt_to_ref_coords = {}

    pos_ref = coord_ref[0]
    pos_elt = coord_elt[0]

    # walk the alignment column by column, advancing a cursor on each
    # sequence and recording where every segment position lands on the
    # reference element
    # NOTE(review): the comparisons below assume the items of a column
    # compare equal to the str '-' at gap positions -- confirm against
    # scikit-bio's TabularMSA.iter_positions.
    for pos in tab_msa.iter_positions():
        pos = list(pos)
        b_ref = pos[0]
        b_elt = pos[1]

        if '-' not in pos:
            elt_to_ref_coords[pos_elt] = pos_ref
            pos_ref += 1
            pos_elt += 1

        if b_elt == '-':
            pos_ref += 1

        if b_ref == '-':
            # segment base with no counterpart on the reference element
            elt_to_ref_coords[pos_elt] = 'na'
            pos_elt += 1

    revised_coord_meth_pos = []
    meth_profile = []

    for pos, meth in zip(coord_meth_pos, smoothed_methfrac):
        if pos not in elt_to_ref_coords:
            continue

        revised_pos = elt_to_ref_coords[pos]

        if revised_pos != 'na':
            revised_coord_meth_pos.append(revised_pos)
            meth_profile.append(meth)

    return revised_coord_meth_pos, meth_profile