コード例 #1
0
def test_call_homopolymer_filter_disabled():
    """No call may carry a Homopolymer filter when ``homopolyfilt=False``.

    Contigs and reference targets are loaded per partition, called with the
    homopolymer filter switched off, scored with ``simlike`` against the trio
    sketches, and every resulting filter string is checked.
    """
    # Query contigs, keyed by partition ID.
    contigfh = kevlar.open(
        data_file('homopolymer/12175-3parts.contigs.augfasta'), 'r'
    )
    contigs = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(kevlar.parse_augmented_fastx(contigfh))
    )

    # Reference cutouts (targets), keyed by partition ID.
    gdnafh = kevlar.open(
        data_file('homopolymer/12175-3parts.targets.fasta'), 'r'
    )
    targets = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(
            kevlar.reference.load_refr_cutouts(gdnafh)
        )
    )

    prelimcalls = []
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        prelimcalls.extend(
            kevlar.call.call(
                gdnalist, contiglist, partid=partid, homopolyfilt=False
            )
        )

    # Sketches are loaded in the same order as before: kid, mom, dad, refr.
    kid, mom, dad, refr = (
        kevlar.sketch.load(data_file(sketchfile))
        for sketchfile in (
            'homopolymer/12175-kid.sct',
            'homopolymer/12175-mom.sct',
            'homopolymer/12175-dad.sct',
            'homopolymer/12175-refr.sct',
        )
    )
    calls = list(
        kevlar.simlike.simlike(
            prelimcalls, kid, [mom, dad], refr,
            samplelabels=['Proband', 'Mother', 'Father'],
        )
    )

    assert len(calls) == 6
    assert all('Homopolymer' not in c.filterstr for c in calls)
コード例 #2
0
def test_call_max_target_length(contigs, gdnas, maxtargetlen, numpassing):
    """Check how many calls survive a given ``maxtargetlen`` setting.

    ``contigs`` and ``gdnas`` are test data file names.  Calls whose sequence
    ID is ``'.'`` are treated as "no call" records and must have a ``'.'``
    position and exactly the CONTIG, IKMERS and PART info keys; all other
    calls count towards ``numpassing``.
    """
    contigfh = kevlar.open(data_file(contigs), 'r')
    contigs_by_part = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(kevlar.parse_augmented_fastx(contigfh))
    )

    gdnafh = kevlar.open(data_file(gdnas), 'r')
    targets_by_part = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(
            kevlar.reference.load_refr_cutouts(gdnafh)
        )
    )

    calls = []
    for partid in contigs_by_part:
        contiglist = contigs_by_part[partid]
        gdnalist = targets_by_part[partid]
        calls.extend(
            kevlar.call.call(
                gdnalist, contiglist, partid=partid,
                maxtargetlen=maxtargetlen,
            )
        )

    nocalls = [c for c in calls if c.seqid == '.']
    passcalls = [c for c in calls if c.seqid != '.']
    assert len(passcalls) == numpassing
    expected_keys = ['CONTIG', 'IKMERS', 'PART']
    for nocall in nocalls:
        assert nocall.seqid == nocall.position == '.'
        assert sorted(nocall.info.keys()) == expected_keys
コード例 #3
0
def test_split_cli():
    """Run ``kevlar split`` through the CLI and verify each output file.

    The five-partition input is split into three files.  For each output file
    the number of partitions and the number of reads per partition are
    compared against the expected values.  The temporary directory is removed
    even when an assertion fails.
    """
    infile = data_file('fiveparts.augfastq.gz')
    tempdir = mkdtemp()
    print(tempdir)
    # (output file index, expected number of reads in each partition)
    expected = [
        (0, [67, 12]),
        (1, [23, 11]),
        (2, [15]),
    ]
    try:
        arglist = ['split', infile, '3', tempdir + '/out']
        args = kevlar.cli.parser().parse_args(arglist)
        kevlar.split.main(args)

        for index, sizes in expected:
            outfile = tempdir + '/out.{:d}.augfastx.gz'.format(index)
            readstream = kevlar.parse_augmented_fastx(
                kevlar.open(outfile, 'r')
            )
            partstream = kevlar.parse_partitioned_reads(readstream)
            observed = [len(part) for partid, part in partstream]
            assert observed == sizes
    finally:
        # Clean up regardless of the test outcome so failures do not leak
        # temporary directories.
        rmtree(tempdir)
コード例 #4
0
ファイル: test_seqio.py プロジェクト: jchow32/kevlar
def test_partition_reader_simple():
    """A simple partitioned read file yields two partitions of 4 and 2."""
    fh = kevlar.open(kevlar.tests.data_file('part-reads-simple.fa'), 'r')
    partitions = list(
        kevlar.parse_partitioned_reads(kevlar.parse_augmented_fastx(fh))
    )
    assert len(partitions) == 2
    first, second = partitions[0], partitions[1]
    assert len(first) == 4
    assert len(second) == 2
コード例 #5
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_alac_bigpart():
    """Running alac with ``maxreads=20`` on the five-part data gives 3 calls."""
    readfile = data_file('fiveparts.augfastq.gz')
    refrfile = data_file('fiveparts-refr.fa.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    caller = kevlar.alac.alac(partstream, refrfile, maxreads=20)
    calls = list(caller)
    assert len(calls) == 3
コード例 #6
0
def main(args):
    """Run the alac workflow on ``args.infile`` and write the calls as VCF.

    If ``args.part_id`` is set, only that partition is parsed from the input;
    otherwise all partitions are streamed.  Calls are written to ``args.out``
    through a ``kevlar.vcf.VCFWriter`` after its header has been written.
    """
    reads = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    if not args.part_id:
        partitions = kevlar.parse_partitioned_reads(reads)
    else:
        partitions = kevlar.parse_single_partition(reads, args.part_id)
    vcfout = kevlar.open(args.out, 'w')

    calls = alac(
        partitions,
        args.refr,
        threads=args.threads,
        ksize=args.ksize,
        bigpart=args.bigpart,
        delta=args.delta,
        seedsize=args.seed_size,
        maxdiff=args.max_diff,
        match=args.match,
        mismatch=args.mismatch,
        gapopen=args.open,
        gapextend=args.extend,
        min_ikmers=args.min_ikmers,
        logstream=args.logfile,
    )

    vcfwriter = kevlar.vcf.VCFWriter(
        vcfout, source='kevlar::alac', refr=args.refr
    )
    vcfwriter.write_header()
    for call in calls:
        vcfwriter.write(call)
コード例 #7
0
def test_call_homopolymers_mixed_results():
    """Calling without overriding the homopolymer setting gives mixed filters.

    Of the six scored calls, the three whose filter string is exactly ``PASS``
    or ``Homopolymer`` are checked: a negative control and a borderline call
    that pass, and a positive control that is flagged as ``Homopolymer``.
    """
    contigfh = kevlar.open(
        data_file('homopolymer/12175-3parts.contigs.augfasta'), 'r'
    )
    contigs = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(kevlar.parse_augmented_fastx(contigfh))
    )

    gdnafh = kevlar.open(
        data_file('homopolymer/12175-3parts.targets.fasta'), 'r'
    )
    targets = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(
            kevlar.reference.load_refr_cutouts(gdnafh)
        )
    )

    prelimcalls = []
    for partid in contigs:
        contiglist = contigs[partid]
        gdnalist = targets[partid]
        prelimcalls.extend(
            kevlar.call.call(gdnalist, contiglist, partid=partid)
        )

    # Sketches are loaded in the same order as before: kid, mom, dad, refr.
    kid, mom, dad, refr = (
        kevlar.sketch.load(data_file(sketchfile))
        for sketchfile in (
            'homopolymer/12175-kid.sct',
            'homopolymer/12175-mom.sct',
            'homopolymer/12175-dad.sct',
            'homopolymer/12175-refr.sct',
        )
    )
    calls = list(
        kevlar.simlike.simlike(
            prelimcalls, kid, [mom, dad], refr,
            samplelabels=['Proband', 'Mother', 'Father'],
        )
    )

    assert len(calls) == 6
    for c in calls:
        print(c.vcf)
    unintrstng = [
        c for c in calls if c.filterstr in ('PASS', 'Homopolymer')
    ]
    assert len(unintrstng) == 3

    negative, borderline, positive = unintrstng
    # Negative control: passes the filter.
    assert negative.position == 123651924
    assert negative.filterstr == 'PASS'
    assert negative._refr == 'TAA'
    assert negative._alt == 'T'
    # Borderline case: still passes.
    assert borderline.position == 124641259
    assert borderline.filterstr == 'PASS'
    assert borderline._refr == 'TAAA'
    assert borderline._alt == 'T'
    # Positive control: flagged as a homopolymer.
    assert positive.position == 128660727
    assert positive.filterstr == 'Homopolymer'
コード例 #8
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_no_reference_match(capsys):
    """alac warns on stderr when the reads do not match the reference."""
    readfile = data_file('pico-4.augfastq.gz')
    partitions = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    refr = data_file('localize-refr.fa')
    # Exhaust the workflow so that the warning is actually emitted.
    calls = list(kevlar.alac.alac(partitions, refr))
    captured_out, captured_err = capsys.readouterr()
    assert 'WARNING: no reference matches' in captured_err
コード例 #9
0
def test_localize_no_match(capsys):
    """localize yields nothing and warns when no contig matches the reference."""
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('wasp-pass.contig.augfasta')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(contig_file, 'r'))
    )
    cutoutdata = list(localize(pstream, refr_file, seedsize=41, debug=True))
    assert cutoutdata == []
    captured_out, captured_err = capsys.readouterr()
    assert 'WARNING: no reference matches' in captured_err
コード例 #10
0
def main(args):
    """Distribute the partitions in ``args.infile`` over several output files.

    ``args.numfiles`` output files are opened, named ``<args.base>.<N>`` with
    N running from 1 to ``args.numfiles``, and handed to ``split`` together
    with the stream of partitions.  All output files are closed before
    returning, including when ``split`` raises.
    """
    partfile = kevlar.open(args.infile, 'r')
    readstream = kevlar.parse_augmented_fastx(partfile)
    partstream = kevlar.parse_partitioned_reads(readstream)
    outstreams = list()
    try:
        for i in range(args.numfiles):
            outfile = '{:s}.{:d}'.format(args.base, i + 1)
            # Named `outstream` rather than `os` so the local never shadows
            # the standard `os` module name.
            outstream = kevlar.open(outfile, 'w')
            outstreams.append(outstream)
        split(partstream, outstreams)
    finally:
        # Close explicitly so buffered output is flushed deterministically
        # instead of relying on interpreter shutdown.
        for outstream in outstreams:
            outstream.close()
コード例 #11
0
def main(args):
    """Assemble reads from ``args.augfastq`` and write the resulting contigs.

    If ``args.part_id`` is set only that partition is parsed; otherwise every
    partition is streamed.  Each contig produced by ``assemble`` is printed to
    ``args.out`` in augmented Fastx format.
    """
    reads = kevlar.parse_augmented_fastx(kevlar.open(args.augfastq, 'r'))
    if not args.part_id:
        partitions = kevlar.parse_partitioned_reads(reads)
    else:
        partitions = kevlar.parse_single_partition(reads, args.part_id)
    contigout = kevlar.open(args.out, 'w')
    # The partition ID of each result is not needed for output.
    for _partid, contig in assemble(partitions, maxreads=args.max_reads):
        kevlar.print_augmented_fastx(contig, contigout)
コード例 #12
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_pico_calls(cc, pos, ref, alt):
    """Each pico-var partition file yields exactly one call with known alleles."""
    readfile = data_file('pico-var/cc{:d}.afq.gz'.format(cc))
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    refrfile = data_file('human-random-pico.fa.gz')
    calls = list(kevlar.alac.alac(pstream, refrfile, ksize=25, delta=50))

    assert len(calls) == 1
    onlycall = calls[0]
    assert onlycall._pos == pos
    assert onlycall._refr == ref
    assert onlycall._alt == alt
コード例 #13
0
def test_split():
    """Five partitions split over three streams land in the expected streams."""
    partfile = kevlar.open(data_file('fiveparts.augfastq.gz'), 'r')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(partfile)
    )
    outstreams = [StringIO() for _ in range(3)]
    kevlar.split.split(partstream, outstreams)

    # (partition label, index of the stream expected to contain it)
    placements = (
        ('kvcc=1', 0),
        ('kvcc=2', 1),
        ('kvcc=3', 2),
        ('kvcc=4', 0),
        ('kvcc=5', 1),
    )
    for label, index in placements:
        assert label in outstreams[index].getvalue()
コード例 #14
0
def test_maxdiff(X, numtargets):
    """The number of localized targets depends on the ``maxdiff`` value ``X``."""
    contigfh = kevlar.open(data_file('maxdiff-contig.augfasta'), 'r')
    contigstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(contigfh)
    )
    refrfile = data_file('maxdiff-refr.fa.gz')
    targeter = kevlar.localize.localize(
        contigstream, refrfile, seedsize=51, delta=50, maxdiff=X
    )
    targets = [cutout for _partid, cutout in targeter]
    print([target.defline for target in targets])
    assert len(targets) == numtargets
コード例 #15
0
ファイル: alac.py プロジェクト: jchow32/kevlar
def main(args):
    """Run the alac workflow on ``args.infile`` and print each call's VCF line.

    All partitions of the input are streamed to ``alac``; the ``vcf``
    attribute of every resulting call is printed to ``args.out``.
    """
    infile = kevlar.open(args.infile, 'r')
    partitions = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(infile)
    )
    vcfout = kevlar.open(args.out, 'w')
    calls = alac(
        partitions,
        args.refr,
        ksize=args.ksize,
        delta=args.delta,
        maxdiff=args.max_diff,
        match=args.match,
        mismatch=args.mismatch,
        gapopen=args.open,
        gapextend=args.extend,
        greedy=args.greedy,
        logstream=args.logfile,
    )

    for call in calls:
        print(call.vcf, file=vcfout)
コード例 #16
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_ikmer_filter_python():
    """
    Smoke test for filtering on the number of supporting ikmers.

    Every partition in the data set is supported by only 2 interesting
    k-mers, and the reference file supplied here does not correspond to the
    reads.  The test therefore only passes if the ``min_ikmers=3`` filter
    rejects each partition before the `localize` code is ever invoked.
    """
    readfile = data_file('min_ikmers_filt.augfastq.gz')
    parts = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    refr = data_file('localize-refr.fa')
    caller = kevlar.alac.alac(parts, refr, ksize=31, min_ikmers=3)
    # Exhaust the workflow; completing without an error is the test.
    calls = list(caller)
コード例 #17
0
def test_localize_new():
    """localize reports the expected partition IDs and cutout deflines."""
    refr_file = data_file('fiveparts-refr.fa.gz')
    contig_file = data_file('fiveparts.contigs.augfasta.gz')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(contig_file, 'r'))
    )
    cutoutdata = list(localize(pstream, refr_file, seedsize=51, debug=True))

    partids = [partid for partid, _gdna in cutoutdata]
    deflines = [gdna.defline for _partid, gdna in cutoutdata]
    expected_deflines = [
        'seq1_284663-284950', 'seq1_1924681-1925049', 'seq1_1660589-1660884',
        'seq1_2315741-2316037', 'seq1_2321099-2321322', 'seq1_593102-593389'
    ]
    assert partids == ['1', '1', '2', '3', '4', '5']
    # Cutout order is not checked, only the set of deflines.
    assert sorted(deflines) == sorted(expected_deflines)
コード例 #18
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_alac_generate_mask_lowmem(capsys):
    """Generating a mask with ``maskmem=100`` triggers the mask FPR warning."""
    readfile = data_file('fiveparts.augfastq.gz')
    refrfile = data_file('fiveparts-refr.fa.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    with NamedTemporaryFile(suffix='.nt') as maskfile:
        caller = kevlar.alac.alac(
            partstream, refrfile, maskfile=maskfile.name, maskmem=100
        )
        calls = list(caller)
        assert len(calls) == 5
    captured_out, captured_err = capsys.readouterr()
    message = 'WARNING: mask FPR is 0.8065; exceeds user-specified limit'
    # The warning may be routed to either stream.
    assert message in captured_out or message in captured_err
コード例 #19
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_alac_matedist():
    """All three calls in the mate-dist data set pass at known positions."""
    readfile = data_file('mate-dist/cc130.augfastq.gz')
    refrfile = data_file('mate-dist/cc130.refr.fa.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    calls = list(
        kevlar.alac.alac(
            partstream, refrfile, ksize=31, delta=50, seedsize=51
        )
    )
    assert len(calls) == 3
    passed = [c for c in calls if c.filterstr == 'PASS']
    assert len(passed) == 3
    positions = sorted(c.position for c in passed)
    assert positions == [1475, 115377, 127540]
コード例 #20
0
def test_alac_maxdiff(vcfposition, X, cigar):
    """The single call's CIGAR and position depend on the ``maxdiff`` value."""
    readfh = kevlar.open(data_file('maxdiff-reads.augfastq.gz'), 'r')
    pstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(readfh)
    )
    refrfile = data_file('maxdiff-refr.fa.gz')
    calls = list(
        kevlar.alac.alac(
            pstream, refrfile, ksize=31, delta=50, seedsize=51, maxdiff=X
        )
    )
    assert len(calls) == 1
    onlycall = calls[0]
    assert onlycall.cigar == cigar
    # The call's position is one less than the VCF position passed in.
    assert onlycall.position == vcfposition - 1
コード例 #21
0
ファイル: test_alac.py プロジェクト: johnsmith2077/kevlar
def test_alac_generate_mask():
    """The mask written by alac is identical to the stored reference mask."""
    readfile = data_file('fiveparts.augfastq.gz')
    refrfile = data_file('fiveparts-refr.fa.gz')
    partstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    )
    with NamedTemporaryFile(suffix='.nt') as maskfile:
        caller = kevlar.alac.alac(
            partstream, refrfile, maskfile=maskfile.name, maskmem=1e6
        )
        calls = list(caller)
        assert len(calls) == 5
        for call in calls:
            print(call.vcf)
        # Compare while the temporary file still exists.
        testfilename = data_file('fiveparts-genmask.nodetable')
        assert filecmp.cmp(testfilename, maskfile.name) is True
コード例 #22
0
ファイル: localize.py プロジェクト: kevlar-dev/kevlar
def main(args):
    """Localize contigs against the reference and write the cutouts.

    Contigs are read from ``args.contigs``; if ``args.part_id`` is set only
    that partition is parsed.  Each cutout yielded by ``localize`` is written
    to ``args.out`` as a record named after its defline, with a ``kvcc=<part>``
    suffix appended whenever the partition ID is not ``None``.
    """
    contigs = kevlar.seqio.afxstream(args.contigs)
    if not args.part_id:
        partitions = kevlar.parse_partitioned_reads(contigs)
    else:
        partitions = kevlar.parse_single_partition(contigs, args.part_id)
    cutoutfile = kevlar.open(args.out, 'w')
    cutouts = localize(
        partitions,
        args.refr,
        seedsize=args.seed_size,
        delta=args.delta,
        maxdiff=args.max_diff,
        inclpattern=args.include,
        exclpattern=args.exclude,
    )
    for part, gdna in cutouts:
        seqname = gdna.defline
        if part is not None:
            seqname = seqname + ' kvcc={}'.format(part)
        kevlar.sequence.write_record(
            kevlar.sequence.Record(name=seqname, sequence=gdna.sequence),
            cutoutfile,
        )
コード例 #23
0
ファイル: alac.py プロジェクト: kevlar-dev/kevlar
def main(args):
    """Run the alac workflow on ``args.infile`` and write the calls as VCF.

    If ``args.part_id`` is set only that partition is parsed from the input;
    otherwise all partitions are streamed.  The workflow is configured from
    the parsed command-line arguments, and its calls are written to
    ``args.out`` through a ``kevlar.vcf.VCFWriter`` after the header.
    """
    reads = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    if not args.part_id:
        partitions = kevlar.parse_partitioned_reads(reads)
    else:
        partitions = kevlar.parse_single_partition(reads, args.part_id)
    vcfout = kevlar.open(args.out, 'w')

    calls = alac(
        partitions, args.refr,
        threads=args.threads, ksize=args.ksize, maxreads=args.max_reads,
        delta=args.delta, seedsize=args.seed_size, maxdiff=args.max_diff,
        inclpattern=args.include, exclpattern=args.exclude,
        match=args.match, mismatch=args.mismatch,
        gapopen=args.open, gapextend=args.extend,
        min_ikmers=args.min_ikmers,
        maskfile=args.gen_mask, maskmem=args.mask_mem,
        maskmaxfpr=args.mask_max_fpr,
        maxtargetlen=args.max_target_length,
    )

    vcfwriter = kevlar.vcf.VCFWriter(
        vcfout, source='kevlar::alac', refr=args.refr
    )
    vcfwriter.write_header()
    for call in calls:
        vcfwriter.write(call)
コード例 #24
0
def test_partition_reader_mixed():
    """Mixing labeled and unlabeled reads raises KevlarPartitionLabelError."""
    fh = kevlar.open(data_file('part-reads-mixed.fa'), 'r')
    readstream = kevlar.parse_augmented_fastx(fh)
    with pytest.raises(
        KevlarPartitionLabelError,
        match=r'with and without partition labels',
    ):
        # The stream must be consumed for the error to surface.
        partitions = list(kevlar.parse_partitioned_reads(readstream))
コード例 #25
0
    '--out-pattern',
    metavar='REGEX',
    help='out file name pattern with a {} placeholder for partition ID')
parser.add_argument('augfastx')
parser.add_argument('partition', nargs='+')
args = parser.parse_args()

# Resolve the output destination: a single file, stdout, or (when only a
# pattern is given) one file per partition opened later inside the loop.
if args.out and args.out_pattern:
    raise Exception('cannot give outfile and outpattern together')
elif args.out:
    args.out = kevlar.open(args.out, 'w')
elif not args.out_pattern:
    args.out = sys.stdout

# Only partitions whose ID was requested on the command line are written.
partids = set(args.partition)
fh = kevlar.open(args.augfastx, 'r')
preader = kevlar.parse_partitioned_reads(kevlar.parse_augmented_fastx(fh))
for partid, partition in preader:
    if partid not in partids:
        continue
    if not args.out_pattern:
        for read in partition:
            kevlar.print_augmented_fastx(read, args.out)
        continue
    # One output file per partition, named by filling in the partition ID.
    outfile = str(args.out_pattern).format(partid)
    with kevlar.open(outfile, 'w') as out:
        for read in partition:
            kevlar.print_augmented_fastx(read, out)
コード例 #26
0
ファイル: test_seqio.py プロジェクト: jchow32/kevlar
def test_partition_reader_mixed():
    """Mixing labeled and unlabeled reads raises KevlarPartitionLabelError.

    The error message must mention that reads with and without partition
    labels were found.
    """
    infile = kevlar.tests.data_file('part-reads-mixed.fa')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(infile, 'r'))
    with pytest.raises(KevlarPartitionLabelError) as ple:
        partitions = [p for p in kevlar.parse_partitioned_reads(readstream)]
    # Check the raised exception itself: `ple` is pytest's ExceptionInfo
    # wrapper, and its str() is not guaranteed to contain the message.
    assert 'with and without partition labels' in str(ple.value)
コード例 #27
0
ファイル: call.py プロジェクト: kevlar-dev/kevlar
def main(args):
    """Call variants per partition and write them to ``args.out`` as VCF.

    Contigs from ``args.queryseq`` are loaded into memory by partition, then
    the reference cutouts in ``args.targetseq`` are streamed partition by
    partition.  For every partition that has contigs, ``call`` is invoked and
    each resulting call is written with a ``kevlar.vcf.VCFWriter``.

    If ``args.gen_mask`` is set, the ``ALTWINDOW`` attribute of each call is
    additionally consumed into a ``khmer.Nodetable`` which is saved to
    ``args.gen_mask`` at the end.
    """
    # Input and output files
    outstream = kevlar.open(args.out, 'w')
    writer = kevlar.vcf.VCFWriter(
        outstream,
        source='kevlar::call',
        refr=args.refr,
    )
    # The header is written up front, before any input has been read.
    writer.write_header()

    # Contigs = query sequences
    contigstream = kevlar.parse_partitioned_reads(
        kevlar.parse_augmented_fastx(kevlar.open(args.queryseq, 'r')))
    contigs_by_partition = load_contigs(contigstream)

    # Reference cutouts are not loaded up front; they are iterated in the
    # main loop below.
    gdnastream = kevlar.parse_partitioned_reads(
        kevlar.reference.load_refr_cutouts(kevlar.open(args.targetseq, 'r')))
    mask = None
    if args.gen_mask:
        message = 'generating mask of variant-spanning k-mers'
        kevlar.plog('[kevlar::call]', message)
        # Size the nodetable from the requested memory: the total bucket
        # count derived from args.mask_mem is divided over `ntables` tables.
        ntables = 4
        buckets = args.mask_mem * _buckets_per_byte['nodegraph'] / ntables
        mask = khmer.Nodetable(args.ksize, buckets, ntables)
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::call] processed contigs/gDNAs for {counter} partitions',
        interval=10,
        breaks=[100, 1000, 10000],
    )
    for partid, gdnas in gdnastream:
        # Progress counts every target partition, including skipped ones.
        progress_indicator.update()
        # Partitions with targets but no contigs produce no calls.
        if partid not in contigs_by_partition:
            continue
        contigs = contigs_by_partition[partid]
        caller = call(
            gdnas,
            contigs,
            partid,
            match=args.match,
            mismatch=args.mismatch,
            gapopen=args.open,
            gapextend=args.extend,
            ksize=args.ksize,
            refrfile=args.refr,
            debug=args.debug,
            mindist=5,
            homopolyfilt=not args.no_homopoly_filter,
            maxtargetlen=args.max_target_length,
        )
        for varcall in caller:
            if args.gen_mask:
                window = varcall.attribute('ALTWINDOW')
                # Only windows at least ksize long are added to the mask;
                # calls without an ALTWINDOW attribute are skipped.
                if window is not None and len(window) >= args.ksize:
                    mask.consume(window)
            writer.write(varcall)
    if args.gen_mask:
        fpr = khmer.calc_expected_collisions(mask, max_false_pos=1.0)
        # An FPR above the limit only produces a logged warning; the mask is
        # saved either way.
        if fpr > args.mask_max_fpr:
            message = 'WARNING: mask FPR is {:.4f}'.format(fpr)
            message += '; exceeds user-specified limit'
            message += ' of {:.4f}'.format(args.mask_max_fpr)
            kevlar.plog('[kevlar::call]', message)
        mask.save(args.gen_mask)