Example #1
0
def test_novel_output_has_mates():
    """Check that `kevlar novel` output carries the expected reads and mates.

    Runs the `novel` subcommand on a micro trio and compares both the read
    names and the mate sequences in the output against a pre-computed
    expected file.
    """
    kid = data_file('microtrios/trio-na-proband.fq.gz')
    mom = data_file('microtrios/trio-na-mother.fq.gz')
    dad = data_file('microtrios/trio-na-father.fq.gz')
    testnovel = data_file('microtrios/novel-na.augfastq.gz')

    with NamedTemporaryFile(suffix='.augfastq') as novelfile:
        arglist = [
            'novel', '--out', novelfile.name, '--case', kid, '--case-min', '5',
            '--control', mom, '--control', dad, '--ctrl-max', '1', '--memory',
            '500K'
        ]
        args = kevlar.cli.parser().parse_args(arglist)
        kevlar.novel.main(args)

        intread_ids = set()
        mate_seqs = set()
        stream = kevlar.parse_augmented_fastx(kevlar.open(novelfile.name, 'r'))
        for read in stream:
            intread_ids.add(read.name)
            mate_seqs.update(read.mates)

        # Set comprehensions instead of set([...]) (flake8-comprehensions C403).
        stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
        test_ids = {r.name for r in stream}
        assert intread_ids == test_ids

        # The stream was consumed above, so re-open it for the mate check.
        stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
        test_mate_seqs = {m for r in stream for m in r.mates}
        assert mate_seqs == test_mate_seqs
Example #2
0
def main(args):
    """CLI entry point for `kevlar alac` (assemble/localize/align/call).

    Reads (partitioned) augmented Fastx input, runs the alac workflow, and
    writes the resulting variant calls to VCF.
    """
    reads = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    if args.part_id:
        partitions = kevlar.parse_single_partition(reads, args.part_id)
    else:
        partitions = kevlar.parse_partitioned_reads(reads)
    outstream = kevlar.open(args.out, 'w')
    workflow = alac(
        partitions, args.refr, threads=args.threads, ksize=args.ksize,
        bigpart=args.bigpart, delta=args.delta, seedsize=args.seed_size,
        maxdiff=args.max_diff, match=args.match, mismatch=args.mismatch,
        gapopen=args.open, gapextend=args.extend,
        min_ikmers=args.min_ikmers, logstream=args.logfile,
    )

    writer = kevlar.vcf.VCFWriter(outstream, source='kevlar::alac',
                                  refr=args.refr)
    writer.write_header()
    for varcall in workflow:
        writer.write(varcall)
Example #3
0
def test_call_max_target_length(contigs, gdnas, maxtargetlen, numpassing):
    """Calls against overlong reference targets are reported as no-calls.

    `numpassing` is the expected number of passing calls once targets longer
    than `maxtargetlen` are excluded; the rest must be '.' no-call records
    with only CONTIG/IKMERS/PART info fields.
    """
    contigfile = data_file(contigs)
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(contigstream)
    contigs = kevlar.call.load_contigs(partstream)

    gdnafile = data_file(gdnas)
    gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
    partstream = kevlar.parse_partitioned_reads(gdnastream)
    targets = kevlar.call.load_contigs(partstream)

    calls = []
    # Iterate partition IDs and contig lists together instead of re-indexing.
    for partid, contiglist in contigs.items():
        gdnalist = targets[partid]
        caller = kevlar.call.call(
            gdnalist, contiglist, partid=partid, maxtargetlen=maxtargetlen
        )
        calls.extend(caller)  # extend() consumes the generator directly

    nocalls = [c for c in calls if c.seqid == '.']
    passcalls = [c for c in calls if c.seqid != '.']
    assert len(passcalls) == numpassing
    for c in nocalls:
        assert c.seqid == c.position == '.'
        assert sorted(c.info.keys()) == ['CONTIG', 'IKMERS', 'PART']
Example #4
0
def main(args):
    """CLI entry point for `kevlar partition`.

    Groups reads into connected components ("partitions") via the
    `partition` generator. With --split, each component is written to its
    own `.augfastq.gz` file; otherwise all reads go to a single output
    stream, each read tagged with its component number.
    """
    if args.split:
        kevlar.mkdirp(args.split, trim=True)
    # A single shared output stream is only needed when not splitting.
    outstream = None if args.split else kevlar.open(args.out, 'w')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    partitioner = partition(readstream,
                            strict=args.strict,
                            minabund=args.min_abund,
                            maxabund=args.max_abund,
                            dedup=args.dedup,
                            gmlfile=args.gml,
                            logstream=args.logfile)
    partnum = 0  # keeps its final value after the loop; 0 if no partitions
    numreads = 0
    for partnum, part in enumerate(partitioner, 1):
        numreads += len(part)
        if args.split:
            ofname = '{:s}.cc{:d}.augfastq.gz'.format(args.split, partnum)
            with kevlar.open(ofname, 'w') as outfile:
                for read in part:
                    kevlar.print_augmented_fastx(read, outfile)
        else:
            for read in part:
                # Tag each read's name with its connected component number.
                read.name += ' kvcc={:d}'.format(partnum)
                kevlar.print_augmented_fastx(read, outstream)
    message = '[kevlar::partition] grouped {:d} reads'.format(numreads)
    message += ' into {:d} connected components'.format(partnum)
    print(message, file=args.logfile)
Example #5
0
def test_split_cli():
    """`kevlar split` CLI: five partitions distributed over three files.

    The three output files were previously verified by three copy-pasted
    stanzas; the expected per-partition read counts are now table-driven.
    """
    infile = data_file('fiveparts.augfastq.gz')
    tempdir = mkdtemp()
    print(tempdir)
    arglist = ['split', infile, '3', tempdir + '/out']
    args = kevlar.cli.parser().parse_args(arglist)
    kevlar.split.main(args)

    # Expected read counts for each partition in each of the 3 output files.
    expected = [[67, 12], [23, 11], [15]]
    for filenum, partsizes in enumerate(expected):
        outfile = '{:s}/out.{:d}.augfastx.gz'.format(tempdir, filenum)
        readstream = kevlar.parse_augmented_fastx(kevlar.open(outfile, 'r'))
        partstream = kevlar.parse_partitioned_reads(readstream)
        partitions = [part for partid, part in partstream]
        assert [len(part) for part in partitions] == partsizes

    rmtree(tempdir)
Example #6
0
def test_call_homopolymer_filter_disabled():
    """With homopolyfilt=False, no call should carry a Homopolymer filter.

    Loads pre-assembled contigs and matching reference cutouts for three
    partitions, calls variants with the homopolymer filter disabled, scores
    the preliminary calls with `simlike` against trio + reference sketches,
    and verifies that none of the six resulting calls is flagged as a
    homopolymer artifact.
    """
    contigfile = data_file('homopolymer/12175-3parts.contigs.augfasta')
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(contigfile, 'r'))
    partstream = kevlar.parse_partitioned_reads(contigstream)
    contigs = kevlar.call.load_contigs(partstream)

    gdnafile = data_file('homopolymer/12175-3parts.targets.fasta')
    gdnastream = kevlar.reference.load_refr_cutouts(kevlar.open(gdnafile, 'r'))
    partstream = kevlar.parse_partitioned_reads(gdnastream)
    targets = kevlar.call.load_contigs(partstream)

    # Call variants partition by partition, homopolymer filtering disabled.
    prelimcalls = list()
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        caller = kevlar.call.call(
            gdnalist, contiglist, partid=partid, homopolyfilt=False
        )
        prelimcalls.extend(list(caller))

    # Score preliminary calls with likelihoods from the trio sketches.
    kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
    mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
    dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
    refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
    scorer = kevlar.simlike.simlike(
        prelimcalls, kid, [mom, dad], refr,
        samplelabels=['Proband', 'Mother', 'Father'],
    )
    calls = list(scorer)

    assert len(calls) == 6
    for c in calls:
        assert 'Homopolymer' not in c.filterstr
Example #7
0
def main(args):
    """CLI entry point for `kevlar augment`.

    Transfers annotations from augmented sequences onto unannotated ("naked")
    sequences and prints the augmented records.
    """
    augstream = kevlar.parse_augmented_fastx(kevlar.open(args.augseqs, 'r'))
    seqstream = kevlar.parse_augmented_fastx(kevlar.open(args.seqs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    records = augment(augstream, seqstream, collapsemates=args.collapse_mates)
    for record in records:
        kevlar.print_augmented_fastx(record, outstream)
Example #8
0
def test_nocall():
    """A mismatched contig/target pair yields an InscrutableCigar no-call."""
    # Intentionally mismatched
    qfile = data_file('phony-deletion-01.contig.fa')
    tfile = data_file('phony-insertion-01.gdna.fa')

    # next() grabs the single record; no need to materialize a list and index.
    qinstream = kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r'))
    query = next(qinstream)
    tinstream = kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r'))
    target = next(tinstream)

    aln = VariantMapping(query, target, 1e6, '25D5M22I5M46D8M13D2M35I')
    assert aln.offset is None
    assert aln.targetshort is None
    assert aln.match is None
    assert aln.leftflank is None
    assert aln.indel is None
    assert aln.indeltype is None
    assert aln.rightflank is None

    variants = list(aln.call_variants(21))
    assert len(variants) == 1
    assert variants[0].vcf == (
        'yourchr\t801\t.\t.\t.\t.\tInscrutableCigar\t'
        'CIGAR=25D5M22I5M46D8M13D2M35I;KSW2=1000000.0;CONTIG=AACTGGTGGGCTCAAGA'
        'CTAAAAAGACTTTTTTGGTGACAAGCAGGGCGGCCTGCCCTTCCTGTAGTGCAAGAAAAT')
Example #9
0
def main(args):
    """CLI entry point for `kevlar mutate`.

    Loads a set of mutations and writes the mutated genome to the output.
    """
    print('[kevlar::mutate] loading mutations', file=args.logfile)
    mutations = load_mutations(kevlar.open(args.mutations, 'r'), args.logfile)

    print('[kevlar::mutate] mutating genome', file=args.logfile)
    # Open the output stream once. The original re-opened args.out in 'w'
    # mode for every record, which (with typical file wrappers) truncates
    # the file on each iteration and leaks file handles.
    outstream = kevlar.open(args.out, 'w')
    for record in mutate_genome(args.genome, mutations):
        write_record(record, outstream)
Example #10
0
def test_augment_contig_mates():
    """Augmenting a contig from reads yields one record with 74 annotations."""
    reads = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('deadbeef.augfastq.gz'), 'r')
    )
    contigseqs = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('deadbeef.contig.fa'), 'r')
    )
    result = list(augment(reads, contigseqs))
    assert len(result) == 1
    assert len(result[0].annotations) == 74
Example #11
0
def main(args):
    """CLI entry point for `kevlar localize`."""
    instream = kevlar.open(args.contigs, 'r')
    contigstream = kevlar.parse_augmented_fastx(instream)
    outstream = kevlar.open(args.out, 'w')
    localizer = localize(contigstream, args.refr, ksize=args.ksize,
                         delta=args.delta)
    for record in localizer:
        khmer.utils.write_record(record, outstream)
Example #12
0
def main(args):
    """CLI entry point for `kevlar varfilter`.

    Filters VCF calls against BED intervals and writes the result as VCF.
    """
    calls = kevlar.vcf.vcfstream(args.vcf)
    intervals = kevlar.parse_bed(kevlar.open(args.filt, 'r'))
    writer = kevlar.vcf.VCFWriter(
        kevlar.open(args.out, 'w'), source='kevlar::varfilter'
    )
    writer.write_header()
    for varcall in varfilter(calls, intervals):
        writer.write(varcall)
Example #13
0
def test_suffix():
    """Reads without a mate keep their /1 or /2 name suffix through dump."""
    bamstream = kevlar.open(data_file('nopair.sam'), 'r')
    refrstream = kevlar.open(data_file('bogus-genome/refr.fa'), 'r')
    refr = kevlar.seqio.parse_seq_dict(refrstream)

    # list() instead of a pass-through comprehension (ruff PERF402).
    records = list(kevlar.dump.dump(bamstream, refr))
    assert len(records) == 1
    # endswith accepts a tuple of suffixes; one call instead of an `or` chain.
    assert records[0].name.endswith(('/1', '/2'))
Example #14
0
def test_kevlar_open():
    """kevlar.open reads an existing file and rejects an unsupported mode."""
    path = kevlar.tests.data_file('wasp-pass.contig.augfasta')
    contents = kevlar.open(path, 'r').read()
    lines = contents.strip().split('\n')
    assert len(lines) == 9

    with pytest.raises(ValueError, match=r'invalid mode "p"'):
        kevlar.open(path, 'p')
Example #15
0
def test_nomargin():
    """Tokenizing leaves the CIGAR untouched when there is no margin."""
    query = next(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('nomargin-r-indel-contigs.augfasta'), 'r')
    ))
    target = next(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('nomargin-r-gdna.fa'), 'r')
    ))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tokenizer = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    assert tokenizer._cigar == tokenizer._origcigar
Example #16
0
def test_compact():
    """Compacting predicted calls against a BED index yields 33 calls."""
    index = kevlar.evaluate.populate_index_from_bed(
        kevlar.open(data_file('compact-test-refr.bed.gz'), 'r')
    )
    reader = kevlar.vcf.VCFReader(
        kevlar.open(data_file('compact-test-pred.vcf.gz'), 'r')
    )
    calls = list(kevlar.evaluate.compact(reader, index, delta=10))
    assert len(calls) == 33
Example #17
0
def test_augment_reads_mates():
    """augment() preserves read sequences and annotations."""
    original = list(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('deadbeef.augfastq.gz'), 'r')
    ))
    naked = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('deadbeef.fq.gz'), 'r')
    )
    augmented = list(augment(original, naked, upint=5))
    for before, after in zip(original, augmented):
        assert before.sequence == after.sequence
        assert before.annotations == after.annotations
Example #18
0
def main(args):
    """CLI entry point for `kevlar dump`.

    Optionally loads a reference sequence, then dumps reads to Fastq.
    """
    outstream = kevlar.open(args.out, 'w')
    refrseqs = None
    if args.refr:
        print('[kevlar::dump] Loading reference sequence', file=args.logfile)
        refrseqs = kevlar.seqio.parse_seq_dict(kevlar.open(args.refr, 'r'))
    reads = dump(args.reads, refrseqs, args.pair_mode, logstream=args.logfile)
    for read in reads:
        write_record(read, outstream)
Example #19
0
def main_greedy(args):
    """CLI entry point for greedy assembly mode.

    The output file is opened lazily so that no file is created when the
    assembler produces no contigs.
    """
    instream = kevlar.open(args.augfastq, 'r')
    reads = kevlar.parse_augmented_fastx(instream)
    contigs = assemble_greedy(reads, args.gml, args.debug, args.logfile)
    outstream = None
    for contig in contigs:
        if outstream is None:
            outstream = kevlar.open(args.out, 'w')
        kevlar.print_augmented_fastx(contig, outstream)
Example #20
0
def test_gap_center_aligned(contig, gdna, newcigar, origcigar, nblocks):
    """Tokenizing a contig/gDNA alignment yields the expected CIGAR blocks."""
    query = next(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('cigar/' + contig), 'r')
    ))
    target = next(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('cigar/' + gdna), 'r')
    ))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tokenizer = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    assert len(tokenizer.blocks) == nblocks
    assert tokenizer._cigar == newcigar
    assert tokenizer._origcigar == origcigar
Example #21
0
def main(args):
    """CLI entry point for `kevlar assemble`."""
    reads = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
    if args.part_id:
        partitions = kevlar.parse_single_partition(reads, args.part_id)
    else:
        partitions = kevlar.parse_partitioned_reads(reads)
    outstream = kevlar.open(args.out, 'w')
    for partid, contig in assemble(partitions, maxreads=args.max_reads):
        kevlar.print_augmented_fastx(contig, outstream)
Example #22
0
def test_call_near_end(query, target, dist, n, trimcount):
    """Calls near an alignment's end honor `mindist` and trim correctly."""
    contigstream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file(query), 'r')
    )
    contig = next(contigstream)
    cutoutstream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file(target), 'r')
    )
    cutout = next(cutoutstream)
    aln = VariantMapping(contig, cutout)
    calls = list(aln.call_variants(31, mindist=dist))
    assert len(calls) == n
    assert aln.trimmed == trimcount
Example #23
0
def main(args):
    """CLI entry point for `kevlar split`.

    Distributes partitioned reads round-robin across `args.numfiles` output
    files named `{base}.1` .. `{base}.N`.
    """
    partfile = kevlar.open(args.infile, 'r')
    readstream = kevlar.parse_augmented_fastx(partfile)
    partstream = kevlar.parse_partitioned_reads(readstream)
    # The original bound each stream to a variable named `os`, shadowing the
    # `os` module; build the list with a comprehension and no shadowing.
    outstreams = [
        kevlar.open('{:s}.{:d}'.format(args.base, i + 1), 'w')
        for i in range(args.numfiles)
    ]
    split(partstream, outstreams)
Example #24
0
def test_call_num_interesting_kmers():
    """A single variant call reports the expected IKMERS attribute."""
    contigstream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('iktest.contig.fa'), 'r')
    )
    cutoutstream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file('iktest.gdna.fa'), 'r')
    )
    aln = VariantMapping(next(contigstream), next(cutoutstream))
    calls = list(aln.call_variants(29))
    assert len(calls) == 1
    assert calls[0].attribute('IKMERS') == '1'
Example #25
0
def test_varfilter_single():
    """A single BED interval flags exactly one of five variant calls."""
    bedstream = kevlar.parse_bed(
        kevlar.open(data_file('fiveparts-ignore-single.bed'), 'r'))
    with kevlar.open(data_file('five-snvs-with-likelihood.vcf'), 'r') as vcffh:
        reader = kevlar.vcf.VCFReader(vcffh)
        varcalls = list(kevlar.varfilter.varfilter(reader, bedstream))
    assert len(varcalls) == 5
    flagged = [vc for vc in varcalls if vc.filterstr != 'PASS']
    assert len(flagged) == 1
    assert flagged[0].position == 36385017
Example #26
0
def test_augment_contigs():
    """Interesting k-mers transfer from reads onto the assembled contig."""
    reads = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('snorkel.augfastq'), 'r')
    )
    contigseqs = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('snorkel-contig.fasta'), 'r')
    )
    augseqs = list(augment(reads, contigseqs))
    assert len(augseqs) == 1
    annotations = augseqs[0].annotations
    assert len(annotations) == 3
    assert [ikmer.offset for ikmer in annotations] == [17, 20, 22]
Example #27
0
def filter(readfile, mask=None, memory=1e6, maxfpr=0.01, casemin=6, ctrlmax=1):
    """Two-pass k-mer abundance filter over augmented Fastx reads.

    The first pass counts k-mers (optionally against a mask) and checks the
    counting false-positive rate; the second pass yields the reads that pass
    the case/control abundance thresholds.
    """
    timer = kevlar.Timer()
    timer.start()
    stream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    counts = first_pass(stream, mask, memory, timer)
    check_fpr(counts, maxfpr)
    # The first pass consumed the stream; re-open the input file.
    stream = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    yield from second_pass(stream, counts, casemin, ctrlmax, timer)
    elapsed = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(elapsed)
    kevlar.plog('[kevlar::filter]', message)
Example #28
0
def test_call_pico_indel(ccid, varcall):
    """Calling a pico contig against its gDNA yields the expected variant."""
    qfile = data_file('pico' + ccid + '.contig.augfasta')
    tfile = data_file('pico' + ccid + '.gdna.fa')

    # list() instead of pass-through comprehensions (ruff PERF402).
    qinstream = kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r'))
    queries = list(qinstream)
    tinstream = kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r'))
    targets = list(tinstream)

    calls = list(call(targets, queries))
    assert len(calls) == 1
    assert str(calls[0]) == varcall
Example #29
0
def test_gap_center_aligned():
    """Tokenizer normalizes the trailing match into the central block."""
    query = next(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('cigar/b.contig.fa'), 'r')
    ))
    target = next(kevlar.parse_augmented_fastx(
        kevlar.open(data_file('cigar/b.gdna.fa'), 'r')
    ))
    cigar, score = kevlar.align(target.sequence, query.sequence)
    tokenizer = AlignmentTokenizer(query.sequence, target.sequence, cigar)
    assert len(tokenizer.blocks) == 3
    assert tokenizer._cigar == '41D150M50D'
    assert tokenizer._origcigar == '41D144M50D6M'
Example #30
0
def main(args):
    """CLI entry point for `kevlar localize`.

    Localizes contigs against the reference and writes each reference
    cutout as a Fasta-style record.
    """
    contigstream = kevlar.parse_augmented_fastx(kevlar.open(args.contigs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    cutouts = localize(contigstream, args.refr, seedsize=args.seed_size,
                       delta=args.delta, maxdiff=args.max_diff,
                       logstream=args.logfile)
    for cutout in cutouts:
        record = Record(name=cutout.defline, sequence=cutout.sequence)
        kevlar.sequence.write_record(record, outstream)