def call(targetlist, querylist, match=1, mismatch=2, gapopen=5, gapextend=0,
         ksize=31):
    """
    Wrap the `kevlar call` procedure as a generator function.

    Input is the following.
    - an iterable containing one or more target sequences from the reference
      genome, stored as khmer or screed sequence records
    - an iterable containing one or more contigs assembled by kevlar, stored
      as khmer or screed sequence records
    - alignment match score (integer)
    - alignment mismatch penalty (integer)
    - alignment gap open penalty (integer)
    - alignment gap extension penalty (integer)
    - k-mer size (integer), passed through unchanged to `make_call`

    Targets are visited in order of record name and, for each target, queries
    are visited from longest to shortest. Each target/query pair is aligned
    and every item produced by `make_call` for that pair is yielded in turn.
    """
    targets = sorted(targetlist, key=lambda record: record.name)
    # Sort the queries once, up front: the order does not depend on the
    # target, and a one-shot iterable (e.g. a generator) would otherwise be
    # exhausted after the first target, leaving later targets with no queries.
    queries = sorted(querylist, reverse=True, key=len)
    for target in targets:
        for query in queries:
            # The other call sites in this file unpack `kevlar.align` as a
            # (cigar, score) pair; only the CIGAR is forwarded to `make_call`.
            cigar, _score = kevlar.align(target.sequence, query.sequence,
                                         match, mismatch, gapopen, gapextend)
            for varcall in make_call(target, query, cigar, ksize):
                yield varcall
def test_nomargin():
    """
    Align the first 'nomargin' contig to the first 'nomargin' gDNA record and
    check that the tokenizer's working CIGAR equals its original CIGAR.
    """
    filenames = ('nomargin-r-indel-contigs.augfasta', 'nomargin-r-gdna.fa')
    # Open both inputs first, then pull the first record from each, in the
    # order contig -> gDNA.
    streams = [kevlar.open(data_file(name), 'r') for name in filenames]
    contig, gdna = (
        next(kevlar.parse_augmented_fastx(stream)) for stream in streams
    )

    cigar, score = kevlar.align(gdna.sequence, contig.sequence)
    tokenizer = AlignmentTokenizer(contig.sequence, gdna.sequence, cigar)
    assert tokenizer._cigar == tokenizer._origcigar
def test_align():
    """
    Smoke test for `kevlar.align`: a fixed target/query pair must produce the
    exact expected (CIGAR, score) result.
    """
    target = (
        'TAAATAAATATCTGGTGTTTGAGGCAAAAAGGCAGACTTAAATTCTAAATCACACCTGTGCTT'
        'CCAGCACTACCTTCAAGCGCAGGTTCGAGCCAGTCAGGCAGGGTACATAAGAGTCCATTGTGC'
        'CTGTATTATTTTGAGCAATGGCTAAAGTACCTTCACCCTTGCTCACTGCTCCCCCACTTCCTC'
        'AAGTCTCATCGTGTTTTTTTTAGAGCTAGTTTCTTAGTCTCATTAGGCTTCAGTCACCAT'
    )
    query = (
        'TCTGGTGTTTGAGGCAAAAAGGCAGACTTAAATTCTAAATCACACCTGTGCTTCCAGCACTACC'
        'TTCAAGCGCAGGTTCGAGCCAGTCAGGACTGCTCCCCCACTTCCTCAAGTCTCATCGTGTTTTT'
        'TTTAGAGCTAGTTTCTTAGTCTCATTAGGCTTCAGTCACCATCATTTCTTATAGGAATACCA'
    )
    expected = ('10D91M69D79M20I', 155)
    observed = kevlar.align(target, query)
    assert observed == expected
def test_gap_center_aligned(contig, gdna, newcigar, origcigar, nblocks):
    """
    Align the first record of `contig` to the first record of `gdna` (both
    file names relative to the 'cigar/' data directory) and check the
    tokenizer's block count, working CIGAR and original CIGAR.

    NOTE(review): the arguments are presumably supplied by a pytest
    parametrization that is not visible here -- confirm.
    """
    contig_stream = kevlar.open(data_file('cigar/' + contig), 'r')
    gdna_stream = kevlar.open(data_file('cigar/' + gdna), 'r')
    contig_record = next(kevlar.parse_augmented_fastx(contig_stream))
    gdna_record = next(kevlar.parse_augmented_fastx(gdna_stream))

    alignment = kevlar.align(gdna_record.sequence, contig_record.sequence)
    cigar, score = alignment
    tokenizer = AlignmentTokenizer(
        contig_record.sequence, gdna_record.sequence, cigar
    )

    assert len(tokenizer.blocks) == nblocks
    assert tokenizer._cigar == newcigar
    assert tokenizer._origcigar == origcigar
def align_both_strands(targetseq, queryseq, match=1, mismatch=2, gapopen=5,
                       gapextend=0):
    """
    Align a query to a target in both orientations and keep the better one.

    The query is aligned as given and then as its reverse complement, using
    the same scoring parameters for both alignments.

    Returns a (cigar, score, strand) tuple. `strand` is -1 when the reverse
    complement alignment scores strictly higher, and 1 otherwise (so ties go
    to the forward orientation).
    """
    scoring = (match, mismatch, gapopen, gapextend)
    fwd_cigar, fwd_score = kevlar.align(targetseq, queryseq, *scoring)
    rev_cigar, rev_score = kevlar.align(
        targetseq, kevlar.revcom(queryseq), *scoring
    )
    if rev_score > fwd_score:
        return rev_cigar, rev_score, -1
    return fwd_cigar, fwd_score, 1
def test_gap_center_aligned():
    """
    Align the first record of 'cigar/b.contig.fa' to the first record of
    'cigar/b.gdna.fa' and check the tokenizer's block count, working CIGAR
    and original CIGAR against fixed expected values.

    NOTE(review): a function with this same name and extra parameters is
    defined earlier in this file; if both live in one module this definition
    replaces it -- confirm that is intended.
    """
    contig_stream = kevlar.open(data_file('cigar/b.contig.fa'), 'r')
    contig_record = next(kevlar.parse_augmented_fastx(contig_stream))
    gdna_stream = kevlar.open(data_file('cigar/b.gdna.fa'), 'r')
    gdna_record = next(kevlar.parse_augmented_fastx(gdna_stream))

    cigar, score = kevlar.align(gdna_record.sequence, contig_record.sequence)
    tokenizer = AlignmentTokenizer(
        contig_record.sequence, gdna_record.sequence, cigar
    )

    block_count = len(tokenizer.blocks)
    assert block_count == 3
    assert tokenizer._cigar == '41D150M50D'
    assert tokenizer._origcigar == '41D144M50D6M'
def test_blocks(contig, gdna):
    """
    Align the first record of the `contig` file to the first record of the
    `gdna` file and check every tokenizer block for internal consistency.

    Each block must have type 'M', 'D' or 'I'. 'M' and 'D' blocks carry a
    target string of the block's length, other blocks have no target; 'M' and
    'I' blocks carry a query string of the block's length, other blocks have
    no query.
    """
    contig_stream = kevlar.open(contig, 'r')
    contig_record = next(kevlar.parse_augmented_fastx(contig_stream))
    gdna_stream = kevlar.open(gdna, 'r')
    gdna_record = next(kevlar.parse_augmented_fastx(gdna_stream))

    cigar, score = kevlar.align(gdna_record.sequence, contig_record.sequence)
    tokenizer = AlignmentTokenizer(
        contig_record.sequence, gdna_record.sequence, cigar
    )

    known_types = ('M', 'D', 'I')
    target_types = ('M', 'D')
    query_types = ('M', 'I')
    for blk in tokenizer.blocks:
        assert blk.type in known_types

        if blk.type not in target_types:
            assert blk.target is None
        else:
            assert len(blk.target) == blk.length

        if blk.type not in query_types:
            assert blk.query is None
        else:
            assert len(blk.query) == blk.length