Esempio n. 1
0
def test_parse_single_partition():
    """Pull individual partitions out of a partitioned read file.

    Labels '1' and '2' must each yield exactly one partition, holding 4 and
    2 entries respectively; a label absent from the file yields nothing.
    """
    infile = data_file('part-reads-simple.fa')

    def load_partition(label):
        # The input is re-opened for every query so that each lookup
        # starts from a fresh record stream.
        stream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
        pairs = list(kevlar.parse_single_partition(stream, label))
        return [entries for _, entries in pairs]

    for label, expected_size in (('1', 4), ('2', 2)):
        found = load_partition(label)
        assert len(found) == 1
        assert len(found[0]) == expected_size

    assert load_partition('alFrED') == []
Esempio n. 2
0
def test_sim_var_geno():
    """Pin the variants produced by genotype simulation for a fixed seed.

    Requests 2 inherited and 2 de novo variants and checks the sequence
    IDs, positions and genotype tuples of the 4 variants that come back.
    """
    refrstream = kevlar.open(data_file('100kbx3.fa.gz'), 'r')
    refrseqs = kevlar.seqio.parse_seq_dict(refrstream)
    # `^` is bitwise XOR, so the seed handed over is a single integer.
    seed = 112358 ^ 853211
    variants = list(
        kevlar.gentrio.simulate_variant_genotypes(
            refrseqs, ninh=2, ndenovo=2, rng=seed,
        )
    )

    seqids, positions, genotypes = [], [], []
    for variant in variants:
        seqids.append(variant.seqid)
        positions.append(variant.position)
        genotypes.append(variant.genotypes)

    print('DEBUG', seqids, positions, genotypes)

    expected_genotypes = [
        ('0/1', '0/1', '1/0'),
        ('1/1', '1/1', '1/1'),
        ('1/0', '0/0', '0/0'),
        ('0/1', '0/0', '0/0'),
    ]
    assert len(variants) == 4
    assert seqids == ['scaf3', 'scaf3', 'scaf1', 'scaf2']
    assert positions == [4936, 57391, 67028, 88584]
    assert genotypes == expected_genotypes
Esempio n. 3
0
def test_load_reads_and_kmers():
    """Make sure augmented records are loaded correctly.

    Loads var1.reads.augfastq and checks the number of reads and k-mers,
    the sequence stored for one read, and the set of read names recorded
    for one k-mer.
    """
    # Use a context manager so the input handle is closed deterministically
    # rather than leaked; both results are unpacked before the file closes.
    with open(data_file('var1.reads.augfastq'), 'r') as instream:
        reads, kmers = load_reads_and_kmers(instream, logstream=None)
    assert len(reads) == 10
    assert len(kmers) == 7

    readname = 'read8f start=8,mutations=0'
    assert reads[readname].sequence == ('CACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAA'
                                        'AGAGTTACACCCCGGTTTTTAGAAGTCTCGACTTTAA'
                                        'GGAAGTGGGCCTACGGCGGAAGCCGT')

    testset = {
        'read2f start=13,mutations=0', 'read8f start=8,mutations=0',
        'read10f start=34,mutations=0', 'read13f start=49,mutations=1',
        'read15f start=54,mutations=1', 'read16f start=13,mutations=1',
        'read22f start=5,mutations=0', 'read35f start=25,mutations=0',
        'read37f start=9,mutations=0',
    }
    assert kmers['CCGGTTTTTAGAAGTCTCGACTTTAAGGA'] == testset
Esempio n. 4
0
def test_gentrio_smoketest():
    """Smoke test: simulate a trio and inspect the written haplotypes.

    Three in-memory streams receive the simulated output, which is read
    back as proband, mother and father sequence dictionaries (in that
    order). The first and third variants are then checked for presence or
    absence of their alternate and reference windows in each haplotype.
    """
    refrstream = kevlar.open(data_file('100kbx3.fa.gz'), 'r')
    refrseqs = kevlar.seqio.parse_seq_dict(refrstream)
    outstreams = [StringIO(), StringIO(), StringIO()]
    variants = list(
        kevlar.gentrio.gentrio(
            refrseqs, outstreams, ninh=2, ndenovo=1, seed=1985,
        )
    )

    for variant in variants:
        print(variant.vcf, file=sys.stderr)

    # Rewind every stream before reading the sequences back.
    for stream in outstreams:
        stream.seek(0)
    probandseqs, motherseqs, fatherseqs = [
        kevlar.seqio.parse_seq_dict(stream) for stream in outstreams
    ]

    first = variants[0]
    for haplotype in ('scaf1_haplo1', 'scaf1_haplo2'):
        print(probandseqs[haplotype][first.position])
    assert first.genotypes[0] == '0/1'
    assert first.refrwindow in probandseqs['scaf1_haplo1']
    assert first.refrwindow not in probandseqs['scaf1_haplo2']
    assert first.window not in probandseqs['scaf1_haplo1']
    assert first.window in probandseqs['scaf1_haplo2']

    third = variants[2]
    for sampleseqs in (probandseqs, motherseqs, fatherseqs):
        for haplotype in ('scaf3_haplo1', 'scaf3_haplo2'):
            print(sampleseqs[haplotype][third.position])
    assert third.window in probandseqs['scaf3_haplo1']
    assert third.refrwindow in probandseqs['scaf3_haplo2']
    assert third.refrwindow in motherseqs['scaf3_haplo1']
    assert third.refrwindow in motherseqs['scaf3_haplo2']
    assert third.refrwindow in fatherseqs['scaf3_haplo1']
    assert third.window in fatherseqs['scaf3_haplo2']
Esempio n. 5
0
def test_load_mutations_y():
    """Load muts-y.tsv and verify the single mutation on each sequence.

    The loaded mapping must have three keys, each mapping to exactly one
    mutation equal to the expected insertion, deletion or SNV record.
    """
    instream = kevlar.open(data_file('muts-y.tsv'), 'r')
    mutations = kevlar.mutate.load_mutations(instream, stderr)
    assert len(mutations) == 3

    expected = [
        ('scaffold399',
         Mutation(seq='scaffold399', pos=685357, type='ins',
                  data='AGCTACCCCAGTGAGTCGGTAATGTGATC')),
        ('scaffold982',
         Mutation(seq='scaffold982', pos=108754, type='del', data='23')),
        ('scaffold1102',
         Mutation(seq='scaffold1102', pos=260686, type='snv', data='1')),
    ]
    for seqid, testmut in expected:
        assert seqid in mutations
        assert len(mutations[seqid]) == 1
        observed = mutations[seqid][0]
        assert observed == testmut
Esempio n. 6
0
def test_augfastx_reader_e1():
    """Parse the first record of example1.augfastq and check its content.

    Verifies the record name, its sequence, and the k-mer sequence, size,
    offset and abundance tuple of each of its two annotations.
    """
    infilename = data_file('example1.augfastq')
    # Only the first record is needed; the context manager closes the
    # handle as soon as it has been read instead of leaking it.
    with open(infilename, 'r') as infile:
        record = next(kevlar.parse_augmented_fastx(infile))

    assert record.name == 'e1'
    assert record.sequence == (
        'TTAACTCTAGATTAGGGGCGTGACTTAATAAGGTGTGGGCCTAAGCGTCT'
    )
    assert len(record.annotations) == 2

    ikmer = record.annotations[0]
    assert record.ikmerseq(ikmer) == 'AGGGGCGTGACTTAATAAG'
    assert ikmer.ksize == 19
    assert ikmer.offset == 13
    assert ikmer.abund == (12, 15, 1, 1)

    ikmer = record.annotations[1]
    assert record.ikmerseq(ikmer) == 'GGGCGTGACTTAATAAGGT'
    assert ikmer.ksize == 19
    assert ikmer.offset == 15
    assert ikmer.abund == (20, 28, 0, 1)
Esempio n. 7
0
def test_call_homopolymers_mixed_results():
    """Call and score variants for three partitions with mixed outcomes.

    Preliminary calls are made partition by partition, scored against the
    proband, parent and reference sketches, and the calls whose filter
    string is 'PASS' or 'Homopolymer' are then checked one by one.
    """
    def load_by_partition(recordstream):
        partstream = kevlar.parse_partitioned_reads(recordstream)
        return kevlar.call.load_contigs(partstream)

    contigfile = data_file('homopolymer/12175-3parts.contigs.augfasta')
    contigs = load_by_partition(
        kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
    )

    gdnafile = data_file('homopolymer/12175-3parts.targets.fasta')
    targets = load_by_partition(
        kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
    )

    prelimcalls = []
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        prelimcalls.extend(
            kevlar.call.call(gdnalist, contiglist, partid=partid)
        )

    kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
    mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
    dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
    refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
    calls = list(
        kevlar.simlike.simlike(
            prelimcalls, kid, [mom, dad], refr,
            samplelabels=['Proband', 'Mother', 'Father'],
        )
    )

    assert len(calls) == 6
    for call in calls:
        print(call.vcf)

    # Keep only the calls labelled PASS or Homopolymer.
    keepers = []
    for call in calls:
        if call.filterstr in ('PASS', 'Homopolymer'):
            keepers.append(call)
    assert len(keepers) == 3

    negctrl, borderline, posctrl = keepers
    assert negctrl.position == 123651924
    assert negctrl.filterstr == 'PASS'  # negative control
    assert negctrl._refr == 'TAA'
    assert negctrl._alt == 'T'
    assert borderline.position == 124641259
    assert borderline.filterstr == 'PASS'  # borderline
    assert borderline._refr == 'TAAA'
    assert borderline._alt == 'T'
    assert posctrl.position == 128660727
    assert posctrl.filterstr == 'Homopolymer'  # positive control
Esempio n. 8
0
def test_augfastx_reader_e2():
    """Parse the first record of example2.augfastq and check its content.

    Verifies the record name, its sequence, and the k-mer sequence, size,
    offset and abundance tuple of each of its two annotations.
    """
    infilename = data_file('example2.augfastq')
    # Only the first record is needed; the context manager closes the
    # handle as soon as it has been read instead of leaking it.
    with open(infilename, 'r') as infile:
        record = next(kevlar.parse_augmented_fastx(infile))

    assert record.name == 'ERR894724.125497791/1'
    assert record.sequence == (
        'TAGCCAGTTTGGGTAATTTTAATTGTAAAACTTTTTTTTCTTTTTTTTTGATTTTTTTTTTTCAAGCAG'
        'AAGACGGCATACGAGCTCTTTTCACGTGACTGGAGTTCAGACGTGTGCTCTTCCGAT'
    )
    assert len(record.annotations) == 2

    ikmer = record.annotations[0]
    assert record.ikmerseq(ikmer) == 'GGCATACGAGCTCTTTTCACGTGACTGGAGT'
    assert ikmer.ksize == 31
    assert ikmer.offset == 74
    assert ikmer.abund == (23, 0, 0)

    ikmer = record.annotations[1]
    assert record.ikmerseq(ikmer) == 'GCTCTTTTCACGTGACTGGAGTTCAGACGTG'
    assert ikmer.ksize == 31
    assert ikmer.offset == 83
    assert ikmer.abund == (23, 0, 0)
Esempio n. 9
0
def test_get_seed_matches():
    """Write six seeds to a temporary FASTA file and look them up.

    The result of get_seed_matches() against fiveparts-refr.fa.gz must be
    exactly the expected mapping from sequence to a set of (sequence ID,
    position) pairs.
    """
    seedfasta = (
        '>seed0\nATCTGTTCTTGGCCAATAGAAAAAGCAAGGAGCCCTGAAAGACTCACAGTG\n'
        '>seed1\nAAAAGGAAATGTTAACAACAAAATCACACAGATAAACCATCACAAGATCTG\n'
        '>seed2\nGATTCTAGGAGCTTGTTACTGCTGCTGAAAAAGGAAATGTTAACAACAAAA\n'
        '>seed3\nAACCAATAGAGGTCCACAGAAGTATATATAATCTGTTCTTGGCCAATAGAA\n'
        '>seed4\nTTGTGTGTAAAAACCAATAGAGGTCCACAGAAGTATATATAATCTGTTCTT\n'
        '>seed5\nAAGATACTATAATATGTTTCCCTGAGCACACCCCTTCGAAAGAGCAGAATT\n')
    expected = {
        'AACCAATAGAGGTCCACAGAAGTATATATAATCTGTTCTTGGCCAATAGAA':
        {('seq1', 284819)},
        'AAGATACTATAATATGTTTCCCTGAGCACACCCCTTCGAAAGAGCAGAATT':
        {('seq1', 284722)},
        'ATCTGTTCTTGGCCAATAGAAAAAGCAAGGAGCCCTGAAAGACTCACAGTG':
        {('seq1', 284849)},
        'AAGAACAGATTATATATACTTCTGTGGACCTCTATTGGTTTTTACACACAA':
        {('seq1', 284808)},
    }
    with NamedTemporaryFile(suffix='.fa', mode='w') as seedfile:
        # Flush so the seeds are on disk before the file is read by name.
        seedfile.write(seedfasta + '\n')
        seedfile.flush()
        refrfile = data_file('fiveparts-refr.fa.gz')
        observed = get_seed_matches(seedfile.name, refrfile, seedsize=51)
        print(observed)
        assert observed == expected
Esempio n. 10
0
def test_graph_init():
    """Test graph initialization."""
    graph = kevlar.ReadGraph()
    readstream = kevlar.open(data_file('var1.reads.augfastq'), 'r')
    graph.load(kevlar.parse_augmented_fastx(readstream))
    graph.populate_edges(strict=True)

    # Every one of the 10 reads becomes a node.
    assert len(graph.nodes()) == 10

    # read23f is expected to be connected to 6 other reads.
    r23name = 'read23f start=67,mutations=0'
    assert len(graph[r23name]) == 6

    # Spot-check the attributes stored on one of its edges.
    r35name = 'read35f start=25,mutations=0'
    edge = graph[r23name][r35name]
    assert edge['offset'] == 42
    assert edge['overlap'] == 58

    # connected_components() reports two components while
    # graph.partitions() yields a single partition.
    # NOTE(review): an earlier comment here claimed a single CC, which
    # disagrees with the asserted count of 2 -- presumably the extra
    # component is an isolated read; confirm.
    assert len(list(connected_components(graph))) == 2
    assert len(list(graph.partitions())) == 1

    r8name = 'read8f start=8,mutations=0'
    r37name = 'read37f start=9,mutations=0'
    edge = graph[r37name][r8name]
    assert edge['offset'] == 1
    assert edge['overlap'] == 99

    # Merge the two reads using the same offset and overlap as the edge.
    pair = OverlappingReadPair(
        tail=graph.get_record(r8name),
        head=graph.get_record(r37name),
        offset=1,
        overlap=99,
        sameorient=True,
        swapped=False,
    )
    merged = merge_pair(pair)
    assert merged == ('CACTGTCCTTACAGGTGGATAGTCGCTTTGTAATAAAAGAGTTAC'
                      'ACCCCGGTTTTTAGAAGTCTCGACTTTAAGGAAGTGGGCCTACGG'
                      'CGGAAGCCGTC')
Esempio n. 11
0
def test_augfastx_reader_withmates():
    """Read four records with mate sequences and check each in turn.

    Each row below lists the expected annotation count, the expected
    number of mates (None when it is not checked) and the expected prefix
    of the first mate (None when it is not checked). The reader must be
    exhausted after the fourth record.
    """
    instream = kevlar.open(data_file('seqs-mates.augfastq'), 'r')
    reader = kevlar.parse_augmented_fastx(instream)

    expectations = [
        (5, 1, 'CTGATAAGCAACTTCAGCAAA'),
        (4, 1, 'ATTAGAAAAAAAAAGTGCATT'),
        (21, 0, None),
        (2, None, 'CAGATGTGTCTTGTGGGCAGT'),
    ]
    for nannot, nmates, mateprefix in expectations:
        record = next(reader)
        assert len(record.annotations) == nannot
        if nmates is not None:
            assert len(record.mates) == nmates
        if mateprefix is not None:
            assert record.mates[0].startswith(mateprefix)

    with pytest.raises(StopIteration):
        next(reader)
Esempio n. 12
0
def test_simlike_cli(fmtstr, sampleargs, minitrio, capsys):
    """Run the `simlike` command end to end and check its printed output.

    The four sketches unpacked from `minitrio` are saved to temporary
    files, passed on the command line together with `sampleargs`, and the
    captured standard output must contain `fmtstr` and the expected
    score strings.
    NOTE(review): fmtstr, sampleargs and minitrio presumably come from
    parametrization/fixtures that are not visible here -- confirm.
    """
    kid, mom, dad, ref = minitrio
    with NamedTemporaryFile(suffix='.ct') as kidct, \
            NamedTemporaryFile(suffix='.ct') as momct, \
            NamedTemporaryFile(suffix='.ct') as dadct, \
            NamedTemporaryFile(suffix='.sct') as refrsct:
        pairs = ((kid, kidct), (mom, momct), (dad, dadct), (ref, refrsct))
        for sketch, tmpfile in pairs:
            sketch.save(tmpfile.name)

        arglist = ['simlike', '--case', kidct.name]
        arglist += ['--controls', momct.name, dadct.name]
        arglist.extend(sampleargs)
        arglist += ['--refr', refrsct.name, data_file('minitrio/calls.vcf')]
        print(arglist)
        kevlar.simlike.main(kevlar.cli.parser().parse_args(arglist))

    out, err = capsys.readouterr()
    for snippet in (
        fmtstr,
        'LIKESCORE=214.103',
        'LLDN=-221.908;LLFP=-785.714;LLIH=-436.011',
    ):
        assert snippet in out
Esempio n. 13
0
def test_load_sample_seqfile(count, smallcount, extension, shortext):
    """Count k-mers from a sequence file and check where output lands.

    With a temporary name lacking a suffix, a file named with `extension`
    appended must exist afterwards. With a name already ending in
    `shortext`, the output must exist under exactly that name and no file
    with `extension` appended may exist.
    """
    infile = data_file('bogus-genome/refr.fa')

    def count_into(outpath):
        sketch = kevlar.count.load_sample_seqfile(
            [infile], 21, 1e6,
            count=count, smallcount=smallcount, outfile=outpath,
        )
        assert sketch.get('GAATCGGTGGCTGGTTGCCGT') > 0
        assert sketch.get('GATTACAGATTACAGATTACA') == 0

    with NamedTemporaryFile() as outfile:
        count_into(outfile.name)
        assert os.path.exists(outfile.name + extension)

    with NamedTemporaryFile(suffix=shortext) as outfile:
        count_into(outfile.name)
        assert not os.path.exists(outfile.name + extension)
        assert os.path.exists(outfile.name)
Esempio n. 14
0
def test_gen_muts():
    """Pin the mutations generated for a fixed seed and type weights.

    The reference and alternate alleles of every generated mutation are
    compared against the expected lists, and the first mutation must have
    no genotypes assigned.
    """
    refrstream = kevlar.open(data_file('100kbx3.fa.gz'), 'r')
    refrseqs = kevlar.seqio.parse_seq_dict(refrstream)
    weights = {'snv': 0.7, 'ins': 0.15, 'del': 0.15}
    mutations = list(
        kevlar.gentrio.generate_mutations(refrseqs, weights=weights, rng=42)
    )

    refrs = [mutation._refr for mutation in mutations]
    alts = [mutation._alt for mutation in mutations]

    print('DEBUG refrs', refrs, file=sys.stderr)
    print('DEBUG alts', alts, file=sys.stderr)

    # Adjacent string literals below are concatenated into single alleles.
    testrefrs = [
        'ATTACGACAGAGTTTGTAGGTGTACGAGCCCAATCCAACGTCGGCCATCCGAGACTCTTTAAGTACCCG'
        'GCCATACACTGTGCGCCGAAAAATCAGCGATCATACCACCGTTTGAAGCTTCACGGCCGAGTGTTCTGG'
        'CGATTCGT', 'TATATGAGCTCTCGACGGAATTTACGAGCGCGTATAAGCCTTTTGCAGTTACAACAT'
        'T', 'A', 'GAGTTGGGTATAATAACGTAGTCGGGGGAGCAGATGGAGCAGTGCGACCGCCG', 'C',
        'G', 'A', 'T', 'G', 'C'
    ]
    testalts = [
        'A', 'T', 'C', 'G', 'G', 'C', 'ATGCGCAGAGGATATGTTAGTGACTATTGAAGGTGGAAC'
        'TTGCAAGGGAATGGGTTCACCCTTGCGATTTCGGGGCTACTAAGCACATAGGCTAACGGCAGATGGAGT'
        'AAGCTACGCCAAAACTAATTAGCGTGCTCGGGGCGTAGGCGGGACCCCGGAAATGATAACCAGGATCAA'
        'ACATCCCTTCTTCGACCGAAGGCTGTTGCGCACGTATGACAGCTCTGTGACGCTCTAGATTCAGCTTTG'
        'AAGTCGTGACACGTTGCGATACCTTGACCTGGATGAAACTTCGCCGGGACTTCCCTGACAA', 'TTTG'
        'TTCCCATGACTTACGCTACACACGAGCCAGCTAGCTGCGAAAACCTAAGAGCCTCCG', 'A', 'CTA'
        'GCGAAACACGGAATAACATCAAATGACAGCTATCTCCCAAGATGGTGGGTAGGTTTATAGTAGAGTGGG'
        'CGGCTACATTCGTCTCCCCGGCCCGCAGCCCGCGCACTATAGCAAAATGTTAATGCAGGTTCTGCCCTC'
        'CATATAGATCACACGCTAAGTCAAAATACGACCCTGTGACCAGCCGCAATCACTTGCTGAATTCCGCAC'
        'CTTGCTCCAGCGACTATCTTCTTCCTTAAGCCCCTGGT'
    ]

    assert refrs == testrefrs
    assert alts == testalts
    assert mutations[0].genotypes is None
Esempio n. 15
0
def test_no_refr():
    """Dump a BAM file without a reference and expect 8 records."""
    bamfile = data_file('bogus-genome/reads.bam')
    records = list(kevlar.dump.dump(kevlar.open(bamfile, 'r')))
    assert len(records) == 8
Esempio n. 16
0
                                          filtermem=1e7,
                                          filterfpr=0.005,
                                          logstream=sys.stderr)
        variants = [v for v in workflow]
    variants = sorted(variants, key=lambda v: v._pos)
    startpos = [v._pos + 1 for v in variants]
    teststartpos = [
        4073, 185752, 226611, 636699, 834646, 901124, 1175768, 1527139,
        1631013, 2265795
    ]
    assert len(variants) == 10
    assert startpos == teststartpos


def test_simplex_trio1(capsys):
    case = data_file('trio1/case1.fq')
    controls = data_glob('trio1/ctrl[1,2].fq')
    refr = data_file('bogus-genome/refr.fa')
    arglist = [
        'simplex', '--case', case, '--control', controls[0], '--control',
        controls[1], '--case-min', '6', '--ctrl-max', '0', '--novel-memory',
        '1M', '--novel-fpr', '0.2', '--filter-memory', '50K', '--mask-files',
        refr, '--mask-memory', '1M', '--filter-fpr', '0.005', '--ksize', '21',
        refr
    ]
    args = kevlar.cli.parser().parse_args(arglist)
    kevlar.simplex.main(args)

    out, err = capsys.readouterr()
    testvcf = '\t'.join([
        'bogus-genome-chr1', '3567', '.', 'A', 'C', '.', 'PASS', 'RW=GAAGGGCAC'
Esempio n. 17
0
def test_sketch_load_badfilename():
    """Loading a file with an unrecognised name must raise a type error.

    The raised KevlarSketchTypeError must mention the offending filename.
    """
    infile = data_file('test.notasketchtype')
    with pytest.raises(kevlar.sketch.KevlarSketchTypeError) as kste:
        kevlar.sketch.load(infile)
    # Check the exception's own message via `.value`: str() of the
    # ExceptionInfo wrapper is a pytest-version-dependent rendering, not
    # the message itself.
    assert ('sketch type from filename ' + infile) in str(kste.value)
Esempio n. 18
0
def test_sketch_load(filename, testkmer):
    """Load a sketch and check one present and one absent k-mer.

    `testkmer` must have a positive count; a 21-character GATTACA repeat
    must have a count of zero.
    """
    sketch = kevlar.sketch.load(data_file(filename))
    absent = 'GATTACA' * 3
    assert sketch.get(testkmer) > 0
    assert sketch.get(absent) == 0
Esempio n. 19
0
def test_mutate_bogus():
    """An unknown variant type in the mutation file must raise ValueError.

    The error message must name the invalid type, "slippage".
    """
    instream = kevlar.open(data_file('muts-w.txt'), 'r')
    with pytest.raises(ValueError) as ve:
        kevlar.mutate.load_mutations(instream, stderr)
    # Check the exception's own message via `.value`: str() of the
    # ExceptionInfo wrapper is a pytest-version-dependent rendering, not
    # the message itself.
    assert 'invalid variant type "slippage"' in str(ve.value)
Esempio n. 20
0
def test_load_mutations_z():
    """A mutation file that cannot be parsed must raise ValueError.

    The error message must contain 'error parsing mutation'.
    """
    instream = kevlar.open(data_file('muts-z.csv'), 'r')
    with pytest.raises(ValueError) as ve:
        kevlar.mutate.load_mutations(instream, stderr)
    # Check the exception's own message via `.value`: str() of the
    # ExceptionInfo wrapper is a pytest-version-dependent rendering, not
    # the message itself.
    assert 'error parsing mutation' in str(ve.value)
Esempio n. 21
0
def test_reader_format_mismatch(filename, errormsg):
    """Consuming a mismatched VCF must raise VariantAnnotationError.

    The error message must match the `errormsg` pattern.
    """
    vcfstream = kevlar.open(data_file(filename), 'r')
    reader = kevlar.vcf.VCFReader(vcfstream)
    with pytest.raises(kevlar.vcf.VariantAnnotationError, match=errormsg):
        # The reader is consumed in full; the records are not needed.
        list(reader)
Esempio n. 22
0
def test_assembly_edgeless(cc):
    """Assembling the reads of an 'edgeless' component yields no contigs."""
    filename = 'edgeless/cc{:d}.afq.gz'.format(cc)
    readstream = kevlar.open(data_file(filename), 'r')
    reads = list(kevlar.parse_augmented_fastx(readstream))
    contigs = list(kevlar.assembly.fml_asm(reads))
    assert len(contigs) == 0
Esempio n. 23
0
    record.annotate('GATGAGGATGAGGATGAGGATGAGG', 8, (11, 1, 0))

    kevlar.print_augmented_fastx(record, stdout)
    out, err = capsys.readouterr()
    assert read in out


def test_iter_read_multi_file():
    """Iterating over the globbed mask files yields 4 records in total."""
    infiles = kevlar.tests.data_glob('bogus-genome/mask-chr[1,2].fa')
    print(infiles)
    records = list(kevlar.multi_file_iter_khmer(infiles))
    assert len(records) == 4


def test_novel_abund_screen(capsys):
    """Run `novel` with --abund-screen and check the captured output.

    The string '>seq_error' must not appear on standard output.
    """
    casefile = data_file('screen-case.fa')
    ctrlfile = data_file('screen-ctrl.fa')
    arglist = ['novel']
    arglist += ['--ksize', '25']
    arglist += ['--ctrl-max', '1']
    arglist += ['--case-min', '8']
    arglist += ['--case', casefile]
    arglist += ['--control', ctrlfile]
    arglist += ['--abund-screen', '3']
    kevlar.novel.main(kevlar.cli.parser().parse_args(arglist))

    out, err = capsys.readouterr()
    assert '>seq_error' not in out


def test_skip_until(capsys):
    readname = 'bogus-genome-chr1_115_449_0:0:0_0:0:0_1f4/1'
    case = data_file('trio1/case1.fq')
Esempio n. 24
0
# -----------------------------------------------------------------------------
# Copyright (c) 2018 The Regents of the University of California
#
# This file is part of kevlar (http://github.com/dib-lab/kevlar) and is
# licensed under the MIT license: see LICENSE.
# -----------------------------------------------------------------------------

import sys
import kevlar
from kevlar.cigar import AlignmentBlock, AlignmentTokenizer
from kevlar.tests import data_file
import pytest


@pytest.mark.parametrize('contig,gdna', [
    (data_file('cigar/a.contig.fa'), data_file('cigar/a.gdna.fa')),
    (data_file('cigar/b.contig.fa'), data_file('cigar/b.gdna.fa')),
    (data_file('cigar/c.contig.fa'), data_file('cigar/c.gdna.fa')),
    (data_file('phony-snv-01.contig.fa'), data_file('phony-snv-01.gdna.fa')),
    (data_file('phony-snv-02.contig.fa'), data_file('phony-snv-02.gdna.fa')),
])
def test_blocks(contig, gdna):
    query = next(kevlar.parse_augmented_fastx(kevlar.open(contig, 'r')))
    target = next(kevlar.parse_augmented_fastx(kevlar.open(gdna, 'r')))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    for block in tok.blocks:
        assert block.type in ('M', 'D', 'I')
        if block.type in ('M', 'D'):
            assert len(block.target) == block.length
        else:
Esempio n. 25
0
def test_sketch_load_badfilename():
    """Loading a file with an unrecognised name must raise a type error.

    The raised KevlarSketchTypeError must mention the offending filename.
    """
    import re

    infile = data_file('test.notasketchtype')
    # `match` is applied as a regular expression, so the path is escaped
    # to be matched literally even if it contains regex metacharacters
    # (for example backslashes or brackets).
    errormsg = r'sketch type from filename ' + re.escape(infile)
    with pytest.raises(kevlar.sketch.KevlarSketchTypeError, match=errormsg):
        kevlar.sketch.load(infile)
Esempio n. 26
0
    badkmers = ['CAGGCCAGGGATCGCCGTG']
    goodkmers = [
        'AGGGGCGTGACTTAATAAG', 'GGGCGTGACTTAATAAGGT',
        'TAGGGGCGTGACTTAATAA', 'GGGGCGTGACTTAATAAGG',
    ]
    for record in validated:
        for kmer in record.annotations:
            seq = record.ikmerseq(kmer)
            assert seq not in badkmers and kevlar.revcom(seq) not in badkmers
            assert seq in goodkmers or kevlar.revcom(seq) in goodkmers


@pytest.mark.parametrize('mask,nkmers,nkmerinstances', [
    (None, 424, 5782),
    (bogusrefr(), 424, 5782),
    (kevlar.sketch.load(data_file('bogus-genome/mask.nt')), 13, 171)
])
def test_ctrl3(mask, nkmers, nkmerinstances):
    """Filter reads under different masks and tally the annotated k-mers.

    Every annotated k-mer of every filtered read is counted under the
    value returned by kevlar.revcommin(); the number of distinct keys and
    the total count must match the parametrized expectations.
    """
    readfile = data_file('trio1/novel_3_1,2.txt')
    tally = defaultdict(int)
    filtered = kevlar.filter.filter(readfile, memory=1e7, mask=mask)
    for read in filtered:
        for ikmer in read.annotations:
            tally[kevlar.revcommin(read.ikmerseq(ikmer))] += 1
    total = sum(tally.values())
    assert len(tally) == nkmers
    assert total == nkmerinstances


def test_filter_abundfilt():
    readfile = data_file('worm.augfasta')
    ikmers = defaultdict(int)
Esempio n. 27
0
def test_ikmer_filter_cli():
    """Run `alac` with --min-ikmers; passes if the command completes."""
    readfile = data_file('min_ikmers_filt.augfastq.gz')
    refrfile = data_file('localize-refr.fa')
    arglist = ['alac']
    arglist += ['--ksize', '31']
    arglist += ['--min-ikmers', '3']
    arglist += [readfile, refrfile]
    kevlar.alac.main(kevlar.cli.parser().parse_args(arglist))
Esempio n. 28
0
def test_assemble_no_edges(capsys):
    """Run `assemble` on the no-edges input; nothing may be printed."""
    readfile = data_file('asmbl-no-edges.augfastq.gz')
    args = kevlar.cli.parser().parse_args(['assemble', readfile])
    kevlar.assemble.main(args)
    captured_out, captured_err = capsys.readouterr()
    assert captured_out == ''
Esempio n. 29
0
def test_bwa_failure():
    """Aligning against a nonexistent reference must raise KevlarBWAError."""
    cmd = ['bwa', 'mem', data_file('not-a-real-file.fa'), '-']
    with pytest.raises(KevlarBWAError):
        # The aligner's results are consumed in full; they are not needed.
        list(kevlar.reference.bwa_align(cmd, '>seq1\nACGT'))
Esempio n. 30
0
def test_assemble_no_edges(capsys):
    """Run `assemble` on the no-edges input and expect an explicit error.

    KevlarEdgelessGraphError must be raised with a message containing
    'nothing to be done, aborting'.
    """
    cliargs = ['assemble', data_file('asmbl-no-edges.augfastq.gz')]
    args = kevlar.cli.parser().parse_args(cliargs)
    with pytest.raises(kevlar.assemble.KevlarEdgelessGraphError) as ege:
        kevlar.assemble.main(args)
    # Check the exception's own message via `.value`: str() of the
    # ExceptionInfo wrapper is a pytest-version-dependent rendering, not
    # the message itself.
    assert 'nothing to be done, aborting' in str(ege.value)