Example #1
def test_split_cli():
    infile = data_file('fiveparts.augfastq.gz')
    tempdir = mkdtemp()
    print(tempdir)
    arglist = ['split', infile, '3', tempdir + '/out']
    args = kevlar.cli.parser().parse_args(arglist)
    kevlar.split.main(args)

    outfile = tempdir + '/out.0.augfastx.gz'
    readstream = kevlar.parse_augmented_fastx(kevlar.open(outfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(readstream)
    partitions = list(partstream)
    partitions = [part for partid, part in partitions]
    assert len(partitions) == 2
    assert len(partitions[0]) == 67
    assert len(partitions[1]) == 12

    outfile = tempdir + '/out.1.augfastx.gz'
    readstream = kevlar.parse_augmented_fastx(kevlar.open(outfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(readstream)
    partitions = list(partstream)
    partitions = [part for partid, part in partitions]
    assert len(partitions) == 2
    assert len(partitions[0]) == 23
    assert len(partitions[1]) == 11

    outfile = tempdir + '/out.2.augfastx.gz'
    readstream = kevlar.parse_augmented_fastx(kevlar.open(outfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(readstream)
    partitions = list(partstream)
    partitions = [part for partid, part in partitions]
    assert len(partitions) == 1
    assert len(partitions[0]) == 15

    rmtree(tempdir)
Example #2
def main(args):
    augseqs = kevlar.parse_augmented_fastx(kevlar.open(args.augseqs, 'r'))
    nakedseqs = kevlar.parse_augmented_fastx(kevlar.open(args.seqs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    docollapse = args.collapse_mates
    for record in augment(augseqs, nakedseqs, collapsemates=docollapse):
        kevlar.print_augmented_fastx(record, outstream)
Example #3
def test_novel_output_has_mates():
    kid = data_file('microtrios/trio-na-proband.fq.gz')
    mom = data_file('microtrios/trio-na-mother.fq.gz')
    dad = data_file('microtrios/trio-na-father.fq.gz')
    testnovel = data_file('microtrios/novel-na.augfastq.gz')

    with NamedTemporaryFile(suffix='.augfastq') as novelfile:
        arglist = [
            'novel', '--out', novelfile.name, '--case', kid, '--case-min', '5',
            '--control', mom, '--control', dad, '--ctrl-max', '1', '--memory',
            '500K'
        ]
        args = kevlar.cli.parser().parse_args(arglist)
        kevlar.novel.main(args)

        intread_ids = set()
        mate_seqs = set()
        stream = kevlar.parse_augmented_fastx(kevlar.open(novelfile.name, 'r'))
        for read in stream:
            intread_ids.add(read.name)
            mate_seqs.update(read.mates)

        stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
        test_ids = set([r.name for r in stream])
        assert intread_ids == test_ids

        stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
        test_mate_seqs = set([m for r in stream for m in r.mates])
        assert mate_seqs == test_mate_seqs
Example #4
def test_augment_contig_mates():
    augfh = kevlar.open(data_file('deadbeef.augfastq.gz'), 'r')
    augreads = kevlar.parse_augmented_fastx(augfh)
    nakedfh = kevlar.open(data_file('deadbeef.contig.fa'), 'r')
    nakedseq = kevlar.parse_augmented_fastx(nakedfh)
    contigs = list(augment(augreads, nakedseq))
    assert len(contigs) == 1
    assert len(contigs[0].annotations) == 74
Example #5
def test_nomargin():
    qfile = kevlar.open(data_file('nomargin-r-indel-contigs.augfasta'), 'r')
    tfile = kevlar.open(data_file('nomargin-r-gdna.fa'), 'r')
    query = next(kevlar.parse_augmented_fastx(qfile))
    target = next(kevlar.parse_augmented_fastx(tfile))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    assert tok._cigar == tok._origcigar
Example #6
def test_augment_reads_mates():
    augfh = kevlar.open(data_file('deadbeef.augfastq.gz'), 'r')
    augreads = list(kevlar.parse_augmented_fastx(augfh))
    nakedfh = kevlar.open(data_file('deadbeef.fq.gz'), 'r')
    nakedseq = kevlar.parse_augmented_fastx(nakedfh)
    newreads = list(augment(augreads, nakedseq, upint=5))
    for oldread, newread in zip(augreads, newreads):
        assert oldread.sequence == newread.sequence
        assert oldread.annotations == newread.annotations
Example #7
def test_gap_center_aligned(contig, gdna, newcigar, origcigar, nblocks):
    qfile = kevlar.open(data_file('cigar/' + contig), 'r')
    tfile = kevlar.open(data_file('cigar/' + gdna), 'r')
    query = next(kevlar.parse_augmented_fastx(qfile))
    target = next(kevlar.parse_augmented_fastx(tfile))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    assert len(tok.blocks) == nblocks
    assert tok._cigar == newcigar
    assert tok._origcigar == origcigar
Example #8
def test_augment_contigs():
    augfh = kevlar.open(data_file('snorkel.augfastq'), 'r')
    augreads = kevlar.parse_augmented_fastx(augfh)
    nakedfh = kevlar.open(data_file('snorkel-contig.fasta'), 'r')
    nakedseq = kevlar.parse_augmented_fastx(nakedfh)
    augseqs = list(augment(augreads, nakedseq))
    assert len(augseqs) == 1
    assert len(augseqs[0].annotations) == 3

    offsets = [k.offset for k in augseqs[0].annotations]
    assert offsets == [17, 20, 22]
Example #9
def filter(readfile, mask=None, memory=1e6, maxfpr=0.01, casemin=6, ctrlmax=1):
    timer = kevlar.Timer()
    timer.start()
    reader = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    counts = first_pass(reader, mask, memory, timer)
    check_fpr(counts, maxfpr)
    reader = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    for read in second_pass(reader, counts, casemin, ctrlmax, timer):
        yield read
    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    kevlar.plog('[kevlar::filter]', message)
Example #10
def test_gap_center_aligned():
    query = next(
        kevlar.parse_augmented_fastx(
            kevlar.open(data_file('cigar/b.contig.fa'), 'r')))
    target = next(
        kevlar.parse_augmented_fastx(
            kevlar.open(data_file('cigar/b.gdna.fa'), 'r')))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    assert len(tok.blocks) == 3
    assert tok._cigar == '41D150M50D'
    assert tok._origcigar == '41D144M50D6M'
Example #11
def test_blocks(contig, gdna):
    query = next(kevlar.parse_augmented_fastx(kevlar.open(contig, 'r')))
    target = next(kevlar.parse_augmented_fastx(kevlar.open(gdna, 'r')))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tok = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    for block in tok.blocks:
        assert block.type in ('M', 'D', 'I')
        if block.type in ('M', 'D'):
            assert len(block.target) == block.length
        else:
            assert block.target is None
        if block.type in ('M', 'I'):
            assert len(block.query) == block.length
        else:
            assert block.query is None
Example #12
def test_assembly(cc, contig):
    filename = 'fml/cc{:d}.afq.gz'.format(cc)
    fh = kevlar.open(data_file(filename), 'r')
    reads = [r for r in kevlar.parse_augmented_fastx(fh)]
    contigs = [c for c in kevlar.assembly.fml_asm(reads)]
    assert len(contigs) == 1
    assert contigs[0] == contig
Example #13
def test_call_homopolymer_filter_disabled():
    contigfile = data_file('homopolymer/12175-3parts.contigs.augfasta')
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(contigstream)
    contigs = kevlar.call.load_contigs(partstream)

    gdnafile = data_file('homopolymer/12175-3parts.targets.fasta')
    gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
    partstream = kevlar.parse_partitioned_reads(gdnastream)
    targets = kevlar.call.load_contigs(partstream)

    prelimcalls = list()
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        caller = kevlar.call.call(
            gdnalist, contiglist, partid=partid, homopolyfilt=False
        )
        prelimcalls.extend(list(caller))

    kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
    mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
    dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
    refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
    scorer = kevlar.simlike.simlike(
        prelimcalls, kid, [mom, dad], refr,
        samplelabels=['Proband', 'Mother', 'Father'],
    )
    calls = list(scorer)

    assert len(calls) == 6
    for c in calls:
        assert 'Homopolymer' not in c.filterstr
Example #14
def test_partition_reader_simple():
    infile = kevlar.tests.data_file('part-reads-simple.fa')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    partitions = [p for p in kevlar.parse_partitioned_reads(readstream)]
    assert len(partitions) == 2
    assert len(partitions[0]) == 4
    assert len(partitions[1]) == 2
Example #15
def test_call_max_target_length(contigs, gdnas, maxtargetlen, numpassing):
    contigfile = data_file(contigs)
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(contigstream)
    contigs = kevlar.call.load_contigs(partstream)

    gdnafile = data_file(gdnas)
    gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
    partstream = kevlar.parse_partitioned_reads(gdnastream)
    targets = kevlar.call.load_contigs(partstream)

    calls = list()
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        caller = kevlar.call.call(
            gdnalist, contiglist, partid=partid, maxtargetlen=maxtargetlen
        )
        calls.extend(list(caller))

    nocalls = [c for c in calls if c.seqid == '.']
    passcalls = [c for c in calls if c.seqid != '.']
    assert len(passcalls) == numpassing
    for c in nocalls:
        assert c.seqid == c.position == '.'
        assert sorted(c.info.keys()) == ['CONTIG', 'IKMERS', 'PART']
Example #16
def main(args):
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    if args.part_id:
        pstream = kevlar.parse_single_partition(readstream, args.part_id)
    else:
        pstream = kevlar.parse_partitioned_reads(readstream)
    outstream = kevlar.open(args.out, 'w')
    workflow = alac(pstream,
                    args.refr,
                    threads=args.threads,
                    ksize=args.ksize,
                    bigpart=args.bigpart,
                    delta=args.delta,
                    seedsize=args.seed_size,
                    maxdiff=args.max_diff,
                    match=args.match,
                    mismatch=args.mismatch,
                    gapopen=args.open,
                    gapextend=args.extend,
                    min_ikmers=args.min_ikmers,
                    logstream=args.logfile)

    writer = kevlar.vcf.VCFWriter(
        outstream,
        source='kevlar::alac',
        refr=args.refr,
    )
    writer.write_header()
    for varcall in workflow:
        writer.write(varcall)
Example #17
def test_nocall():
    # Intentionally mismatched
    qfile = data_file('phony-deletion-01.contig.fa')
    tfile = data_file('phony-insertion-01.gdna.fa')

    qinstream = kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r'))
    query = [record for record in qinstream][0]
    tinstream = kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r'))
    target = [record for record in tinstream][0]

    aln = VariantMapping(query, target, 1e6, '25D5M22I5M46D8M13D2M35I')
    assert aln.offset is None
    assert aln.targetshort is None
    assert aln.match is None
    assert aln.leftflank is None
    assert aln.indel is None
    assert aln.indeltype is None
    assert aln.rightflank is None

    variants = list(aln.call_variants(21))
    assert len(variants) == 1
    assert variants[0].vcf == (
        'yourchr\t801\t.\t.\t.\t.\tInscrutableCigar\t'
        'CIGAR=25D5M22I5M46D8M13D2M35I;KSW2=1000000.0;CONTIG=AACTGGTGGGCTCAAGA'
        'CTAAAAAGACTTTTTTGGTGACAAGCAGGGCGGCCTGCCCTTCCTGTAGTGCAAGAAAAT')
Example #18
def test_alac_bigpart():
    readfile = data_file('fiveparts.augfastq.gz')
    refrfile = data_file('fiveparts-refr.fa.gz')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(readstream)
    calls = list(kevlar.alac.alac(partstream, refrfile, maxreads=20))
    assert len(calls) == 3
Example #19
def main(args):
    if args.split:
        kevlar.mkdirp(args.split, trim=True)
    outstream = None if args.split else kevlar.open(args.out, 'w')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    partitioner = partition(readstream,
                            strict=args.strict,
                            minabund=args.min_abund,
                            maxabund=args.max_abund,
                            dedup=args.dedup,
                            gmlfile=args.gml,
                            logstream=args.logfile)
    partnum = 0
    numreads = 0
    for partnum, part in enumerate(partitioner, 1):
        numreads += len(part)
        if args.split:
            ofname = '{:s}.cc{:d}.augfastq.gz'.format(args.split, partnum)
            with kevlar.open(ofname, 'w') as outfile:
                for read in part:
                    kevlar.print_augmented_fastx(read, outfile)
        else:
            for read in part:
                read.name += ' kvcc={:d}'.format(partnum)
                kevlar.print_augmented_fastx(read, outstream)
    message = '[kevlar::partition] grouped {:d} reads'.format(numreads)
    message += ' into {:d} connected components'.format(partnum)
    print(message, file=args.logfile)
Example #20
def test_assembly_edgeless(cc):
    filename = 'edgeless/cc{:d}.afq.gz'.format(cc)
    fh = kevlar.open(data_file(filename), 'r')
    reads = [r for r in kevlar.parse_augmented_fastx(fh)]
    contigs = [c for c in kevlar.assembly.fml_asm(reads)]
    assert len(contigs) == 0
    with pytest.raises(kevlar.assemble.KevlarEdgelessGraphError):
        contigs = [c for c in kevlar.assemble.assemble_greedy(reads)]
Example #21
def main(args):
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(args.contigs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    for record in localize(contigstream,
                           args.refr,
                           ksize=args.ksize,
                           delta=args.delta):
        khmer.utils.write_record(record, outstream)
Example #22
def test_alac_single_partition(label, position):
    readfile = data_file('fiveparts.augfastq.gz')
    refrfile = data_file('fiveparts-refr.fa.gz')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    partstream = kevlar.parse_single_partition(readstream, label)
    calls = list(kevlar.alac.alac(partstream, refrfile))
    assert len(calls) == 1
    assert calls[0].position == position - 1
    assert calls[0].attribute('PART') == label
Example #23
def test_no_reference_match(capsys):
    readfile = data_file('pico-4.augfastq.gz')
    reads = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    partitions = kevlar.parse_partitioned_reads(reads)
    refr = data_file('localize-refr.fa')
    baldwin = kevlar.alac.alac(partitions, refr)
    calls = list(baldwin)
    out, err = capsys.readouterr()
    assert 'WARNING: no reference matches' in err
Example #24
def main_greedy(args):
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
    outstream = None  # Only create output file if there are contigs
    contigstream = assemble_greedy(readstream, args.gml, args.debug,
                                   args.logfile)
    for contig in contigstream:
        if outstream is None:
            outstream = kevlar.open(args.out, 'w')
        kevlar.print_augmented_fastx(contig, outstream)
Example #25
def test_localize_no_match(capsys):
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('wasp-pass.contig.augfasta')
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contig_file, 'r'))
    pstream = kevlar.parse_partitioned_reads(contigstream)
    localizer = localize(pstream, refr_file, seedsize=41, debug=True)
    cutoutdata = list(localizer)
    assert cutoutdata == []
    out, err = capsys.readouterr()
    assert 'WARNING: no reference matches' in err
Example #26
def test_kevlar_open(basename):
    infilename = kevlar.tests.data_file(basename)
    infile = kevlar.open(infilename, 'r')
    record = next(kevlar.parse_augmented_fastx(infile))

    assert record.name == 'ERR894724.125497791/1'
    assert record.sequence == (
        'TAGCCAGTTTGGGTAATTTTAATTGTAAAACTTTTTTTTCTTTTTTTTTGATTTTTTTTTTTCAAGCAG'
        'AAGACGGCATACGAGCTCTTTTCACGTGACTGGAGTTCAGACGTGTGCTCTTCCGAT')
    assert len(record.ikmers) == 2
Example #27
def test_call_near_end(query, target, dist, n, trimcount):
    contig = next(
        kevlar.parse_augmented_fastx(kevlar.open(data_file(query), 'r')))
    cutout = next(
        kevlar.reference.load_refr_cutouts(kevlar.open(data_file(target),
                                                       'r')))
    aln = VariantMapping(contig, cutout)
    calls = list(aln.call_variants(31, mindist=dist))
    assert len(calls) == n
    assert aln.trimmed == trimcount
Example #28
def main(args):
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
    if args.part_id:
        pstream = kevlar.parse_single_partition(readstream, args.part_id)
    else:
        pstream = kevlar.parse_partitioned_reads(readstream)
    outstream = kevlar.open(args.out, 'w')
    assembler = assemble(pstream, maxreads=args.max_reads)
    for partid, contig in assembler:
        kevlar.print_augmented_fastx(contig, outstream)
Example #29
def main(args):
    partfile = kevlar.open(args.infile, 'r')
    readstream = kevlar.parse_augmented_fastx(partfile)
    partstream = kevlar.parse_partitioned_reads(readstream)
    outstreams = list()
    for i in range(args.numfiles):
        outfile = '{:s}.{:d}'.format(args.base, i + 1)
        os = kevlar.open(outfile, 'w')
        outstreams.append(os)
    split(partstream, outstreams)
Example #30
def test_parse_single_partition():
    infile = data_file('part-reads-simple.fa')

    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    partitions = list(kevlar.parse_single_partition(readstream, '1'))
    partitions = [part for partid, part in partitions]
    assert len(partitions) == 1
    assert len(partitions[0]) == 4

    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    partitions = list(kevlar.parse_single_partition(readstream, '2'))
    partitions = [part for partid, part in partitions]
    assert len(partitions) == 1
    assert len(partitions[0]) == 2

    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    partitions = list(kevlar.parse_single_partition(readstream, 'alFrED'))
    partitions = [part for partid, part in partitions]
    assert partitions == []
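
All of the examples above share the same basic pattern: open a (possibly gzip-compressed) file with kevlar.open, wrap the handle in kevlar.parse_augmented_fastx to iterate over augmented reads, and optionally group the reads by partition with kevlar.parse_partitioned_reads. The following is a minimal sketch of that pattern, not taken from the kevlar code base; the file name reads.augfastq.gz is a hypothetical placeholder.

import kevlar

# Open the (optionally gzipped) augmented FASTQ/FASTA file; the file name
# here is a placeholder.
with kevlar.open('reads.augfastq.gz', 'r') as fh:
    # Iterate over the augmented reads and report basic per-read information.
    for read in kevlar.parse_augmented_fastx(fh):
        print(read.name, len(read.sequence))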