Example #1
0
def call(targetlist,
         querylist,
         match=1,
         mismatch=2,
         gapopen=5,
         gapextend=0,
         ksize=31):
    """
    Wrap the `kevlar call` procedure as a generator function.

    Input is the following.
    - an iterable containing one or more target sequences from the reference
      genome, stored as khmer or screed sequence records
    - an iterable containing one or more contigs assembled by kevlar, stored as
      khmer or screed sequence records
    - alignment match score (integer)
    - alignment mismatch penalty (integer)
    - alignment gap open penalty (integer)
    - alignment gap extension penalty (integer)
    - ksize, forwarded unchanged to `make_call`

    Targets are processed in order of their `name` attribute. For each target,
    queries are processed from longest to shortest (as measured by `len`).

    The function yields every item produced by `make_call` for each
    target/query pair.
    """
    targets = sorted(targetlist, key=lambda record: record.name)
    # Sort the queries once, before the loop: the ordering does not depend on
    # the target, and a one-shot iterable (e.g. a generator) would otherwise
    # be exhausted after the first target, leaving later targets with no
    # queries at all.
    queries = sorted(querylist, reverse=True, key=len)
    for target in targets:
        for query in queries:
            # NOTE(review): other callers in this file unpack the result of
            # `kevlar.align` as `(cigar, score)`; confirm that `make_call`
            # expects whatever this version of `kevlar.align` returns.
            cigar = kevlar.align(target.sequence, query.sequence, match,
                                 mismatch, gapopen, gapextend)
            for varcall in make_call(target, query, cigar, ksize):
                yield varcall
Example #2
0
def test_nomargin():
    """The tokenizer's working CIGAR should equal its original CIGAR here."""
    contig_stream = kevlar.open(
        data_file('nomargin-r-indel-contigs.augfasta'), 'r'
    )
    gdna_stream = kevlar.open(data_file('nomargin-r-gdna.fa'), 'r')

    # Only the first record of each file is used.
    contig = next(kevlar.parse_augmented_fastx(contig_stream))
    gdna = next(kevlar.parse_augmented_fastx(gdna_stream))

    alignment = kevlar.align(gdna.sequence, contig.sequence)
    cigar, score = alignment
    tokenizer = AlignmentTokenizer(contig.sequence, gdna.sequence, cigar)
    assert tokenizer._cigar == tokenizer._origcigar
Example #3
0
def test_align():
    """Smoke test for ksw2 aligner: check CIGAR and score for one pair."""
    refseq = ''.join([
        'TAAATAAATATCTGGTGTTTGAGGCAAAAAGGCAGACTTAAATTCTAAATCACACCTGTGCTT',
        'CCAGCACTACCTTCAAGCGCAGGTTCGAGCCAGTCAGGCAGGGTACATAAGAGTCCATTGTGC',
        'CTGTATTATTTTGAGCAATGGCTAAAGTACCTTCACCCTTGCTCACTGCTCCCCCACTTCCTC',
        'AAGTCTCATCGTGTTTTTTTTAGAGCTAGTTTCTTAGTCTCATTAGGCTTCAGTCACCAT',
    ])
    contig = ''.join([
        'TCTGGTGTTTGAGGCAAAAAGGCAGACTTAAATTCTAAATCACACCTGTGCTTCCAGCACTACC',
        'TTCAAGCGCAGGTTCGAGCCAGTCAGGACTGCTCCCCCACTTCCTCAAGTCTCATCGTGTTTTT',
        'TTTAGAGCTAGTTTCTTAGTCTCATTAGGCTTCAGTCACCATCATTTCTTATAGGAATACCA',
    ])
    expected = ('10D91M69D79M20I', 155)
    observed = kevlar.align(refseq, contig)
    assert observed == expected
Example #4
0
def test_gap_center_aligned(contig, gdna, newcigar, origcigar, nblocks):
    """
    Align the first contig record to the first gDNA record, tokenize the
    alignment, and compare the block count plus the tokenizer's working and
    original CIGAR strings against the expected values.
    """
    contig_path = data_file('cigar/' + contig)
    contig_stream = kevlar.open(contig_path, 'r')
    gdna_path = data_file('cigar/' + gdna)
    gdna_stream = kevlar.open(gdna_path, 'r')

    qrecord = next(kevlar.parse_augmented_fastx(contig_stream))
    trecord = next(kevlar.parse_augmented_fastx(gdna_stream))

    cigar, score = kevlar.align(trecord.sequence, qrecord.sequence)
    tokenizer = AlignmentTokenizer(qrecord.sequence, trecord.sequence, cigar)

    assert len(tokenizer.blocks) == nblocks
    assert tokenizer._cigar == newcigar
    assert tokenizer._origcigar == origcigar
Example #5
0
def align_both_strands(targetseq,
                       queryseq,
                       match=1,
                       mismatch=2,
                       gapopen=5,
                       gapextend=0):
    """
    Align the query to the target in both orientations and keep the better.

    The query is aligned as given and again after `kevlar.revcom`. Returns a
    `(cigar, score, strand)` tuple, where `strand` is 1 when the query as
    given is kept and -1 when the reverse complement scores strictly higher.
    Ties go to the query as given.
    """
    scoring = (match, mismatch, gapopen, gapextend)
    fwd_cigar, fwd_score = kevlar.align(targetseq, queryseq, *scoring)
    rev_cigar, rev_score = kevlar.align(targetseq, kevlar.revcom(queryseq),
                                        *scoring)

    # Strict comparison: equal scores fall through to the forward result.
    if rev_score > fwd_score:
        return rev_cigar, rev_score, -1
    return fwd_cigar, fwd_score, 1
Example #6
0
def test_gap_center_aligned():
    """
    Tokenize the alignment of the `b` contig against the `b` gDNA and check
    the block count and the tokenizer's working and original CIGAR strings.
    """
    contig_stream = kevlar.open(data_file('cigar/b.contig.fa'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contig_stream))
    gdna_stream = kevlar.open(data_file('cigar/b.gdna.fa'), 'r')
    gdna = next(kevlar.parse_augmented_fastx(gdna_stream))

    cigar, score = kevlar.align(gdna.sequence, contig.sequence)
    tokenizer = AlignmentTokenizer(contig.sequence, gdna.sequence, cigar)

    assert len(tokenizer.blocks) == 3
    assert tokenizer._cigar == '41D150M50D'
    assert tokenizer._origcigar == '41D144M50D6M'
Example #7
0
def test_blocks(contig, gdna):
    """
    Check each tokenized alignment block: the type is M, D, or I; M and D
    blocks carry a target of the block's length (otherwise target is None);
    M and I blocks carry a query of the block's length (otherwise query is
    None).
    """
    contig_stream = kevlar.open(contig, 'r')
    qrecord = next(kevlar.parse_augmented_fastx(contig_stream))
    gdna_stream = kevlar.open(gdna, 'r')
    trecord = next(kevlar.parse_augmented_fastx(gdna_stream))

    cigar, score = kevlar.align(trecord.sequence, qrecord.sequence)
    tokenizer = AlignmentTokenizer(qrecord.sequence, trecord.sequence, cigar)

    for blk in tokenizer.blocks:
        assert blk.type in ('M', 'D', 'I')

        has_target = blk.type in ('M', 'D')
        if not has_target:
            assert blk.target is None
        else:
            assert len(blk.target) == blk.length

        has_query = blk.type in ('M', 'I')
        if not has_query:
            assert blk.query is None
        else:
            assert len(blk.query) == blk.length