def test_novel_output_has_mates():
    """Confirm `kevlar novel` output retains read names and mate sequences."""
    kid = data_file('microtrios/trio-na-proband.fq.gz')
    mom = data_file('microtrios/trio-na-mother.fq.gz')
    dad = data_file('microtrios/trio-na-father.fq.gz')
    testnovel = data_file('microtrios/novel-na.augfastq.gz')
    with NamedTemporaryFile(suffix='.augfastq') as novelfile:
        arglist = [
            'novel', '--out', novelfile.name,
            '--case', kid, '--case-min', '5',
            '--control', mom, '--control', dad, '--ctrl-max', '1',
            '--memory', '500K'
        ]
        args = kevlar.cli.parser().parse_args(arglist)
        kevlar.novel.main(args)
        # Collect read names and mate sequences from the computed output.
        observed_ids = set()
        observed_mates = set()
        for record in kevlar.parse_augmented_fastx(
                kevlar.open(novelfile.name, 'r')):
            observed_ids.add(record.name)
            observed_mates.update(record.mates)
        # Compare against the expected output; re-open the file for each
        # comparison since the parser is a one-shot generator.
        stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
        expected_ids = {r.name for r in stream}
        assert observed_ids == expected_ids
        stream = kevlar.parse_augmented_fastx(kevlar.open(testnovel, 'r'))
        expected_mates = {m for r in stream for m in r.mates}
        assert observed_mates == expected_mates
def main(args):
    """Drive the assemble/localize/align/call (alac) workflow from the CLI.

    Reads augmented FASTA/FASTQ from `args.infile`, runs the alac workflow,
    and writes the resulting variant calls as VCF to `args.out`.
    """
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    # Restrict to a single partition when requested; otherwise process all.
    if args.part_id:
        partstream = kevlar.parse_single_partition(readstream, args.part_id)
    else:
        partstream = kevlar.parse_partitioned_reads(readstream)
    outstream = kevlar.open(args.out, 'w')
    workflow = alac(
        partstream, args.refr, threads=args.threads, ksize=args.ksize,
        bigpart=args.bigpart, delta=args.delta, seedsize=args.seed_size,
        maxdiff=args.max_diff, match=args.match, mismatch=args.mismatch,
        gapopen=args.open, gapextend=args.extend,
        min_ikmers=args.min_ikmers, logstream=args.logfile,
    )
    writer = kevlar.vcf.VCFWriter(
        outstream, source='kevlar::alac', refr=args.refr,
    )
    writer.write_header()
    for varcall in workflow:
        writer.write(varcall)
def test_call_max_target_length(contigs, gdnas, maxtargetlen, numpassing):
    """Calls against overlong reference targets are reported as no-calls."""
    contigstream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file(contigs), 'r'))
    contigs_by_part = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(contigstream))
    gdnastream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file(gdnas), 'r'))
    targets_by_part = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(gdnastream))
    calls = []
    for partid, contiglist in contigs_by_part.items():
        caller = kevlar.call.call(
            targets_by_part[partid], contiglist, partid=partid,
            maxtargetlen=maxtargetlen,
        )
        calls.extend(caller)
    nocalls = [c for c in calls if c.seqid == '.']
    passcalls = [c for c in calls if c.seqid != '.']
    assert len(passcalls) == numpassing
    # No-calls carry placeholder coordinates and a fixed set of INFO keys.
    for c in nocalls:
        assert c.seqid == c.position == '.'
        assert sorted(c.info.keys()) == ['CONTIG', 'IKMERS', 'PART']
def main(args):
    """Partition interesting reads by shared novel k-mers (CLI entry point).

    With `--split`, each connected component goes to its own file; otherwise
    all reads go to one stream, tagged with a `kvcc` partition label.
    """
    if args.split:
        kevlar.mkdirp(args.split, trim=True)
    outstream = None if args.split else kevlar.open(args.out, 'w')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    partitioner = partition(
        readstream, strict=args.strict, minabund=args.min_abund,
        maxabund=args.max_abund, dedup=args.dedup, gmlfile=args.gml,
        logstream=args.logfile,
    )
    # `ccount` retains its final value (total partitions) after the loop.
    ccount = 0
    readcount = 0
    for ccount, readset in enumerate(partitioner, 1):
        readcount += len(readset)
        if args.split:
            # One output file per connected component.
            ofname = '{:s}.cc{:d}.augfastq.gz'.format(args.split, ccount)
            with kevlar.open(ofname, 'w') as outfile:
                for read in readset:
                    kevlar.print_augmented_fastx(read, outfile)
        else:
            # Single stream: label each read with its partition number.
            for read in readset:
                read.name += ' kvcc={:d}'.format(ccount)
                kevlar.print_augmented_fastx(read, outstream)
    message = '[kevlar::partition] grouped {:d} reads'.format(readcount)
    message += ' into {:d} connected components'.format(ccount)
    print(message, file=args.logfile)
def test_split_cli():
    """End-to-end check of the `kevlar split` command-line interface."""
    infile = data_file('fiveparts.augfastq.gz')
    tempdir = mkdtemp()
    print(tempdir)
    arglist = ['split', infile, '3', tempdir + '/out']
    args = kevlar.cli.parser().parse_args(arglist)
    kevlar.split.main(args)
    # Expected partition sizes for each of the three output files.
    expected_sizes = [[67, 12], [23, 11], [15]]
    for filenum, sizes in enumerate(expected_sizes):
        outfile = tempdir + '/out.{:d}.augfastx.gz'.format(filenum)
        readstream = kevlar.parse_augmented_fastx(kevlar.open(outfile, 'r'))
        partstream = kevlar.parse_partitioned_reads(readstream)
        partitions = [part for partid, part in partstream]
        assert [len(part) for part in partitions] == sizes
    rmtree(tempdir)
def test_call_homopolymer_filter_disabled():
    """With homopolyfilt=False, no call should carry the Homopolymer filter."""
    contigstream = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('homopolymer/12175-3parts.contigs.augfasta'), 'r'))
    contigs = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(contigstream))
    gdnastream = kevlar.reference.load_refr_cutouts(
        kevlar.open(data_file('homopolymer/12175-3parts.targets.fasta'), 'r'))
    targets = kevlar.call.load_contigs(
        kevlar.parse_partitioned_reads(gdnastream))
    # Generate preliminary calls with the homopolymer filter switched off.
    prelimcalls = []
    for partid, contiglist in contigs.items():
        caller = kevlar.call.call(
            targets[partid], contiglist, partid=partid, homopolyfilt=False)
        prelimcalls.extend(caller)
    kid = kevlar.sketch.load(data_file('homopolymer/12175-kid.sct'))
    mom = kevlar.sketch.load(data_file('homopolymer/12175-mom.sct'))
    dad = kevlar.sketch.load(data_file('homopolymer/12175-dad.sct'))
    refr = kevlar.sketch.load(data_file('homopolymer/12175-refr.sct'))
    scorer = kevlar.simlike.simlike(
        prelimcalls, kid, [mom, dad], refr,
        samplelabels=['Proband', 'Mother', 'Father'],
    )
    calls = list(scorer)
    assert len(calls) == 6
    assert all('Homopolymer' not in c.filterstr for c in calls)
def main(args):
    """Transfer augmentation data onto unannotated sequences (CLI entry point)."""
    augstream = kevlar.parse_augmented_fastx(kevlar.open(args.augseqs, 'r'))
    seqstream = kevlar.parse_augmented_fastx(kevlar.open(args.seqs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    for record in augment(augstream, seqstream,
                          collapsemates=args.collapse_mates):
        kevlar.print_augmented_fastx(record, outstream)
def test_nocall():
    """An inscrutable alignment (mismatched contig/target) yields a no-call."""
    # Intentionally mismatched: deletion contig vs. insertion gDNA cutout.
    qfile = data_file('phony-deletion-01.contig.fa')
    tfile = data_file('phony-insertion-01.gdna.fa')
    query = next(iter(kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r'))))
    target = next(iter(
        kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r'))))
    aln = VariantMapping(query, target, 1e6, '25D5M22I5M46D8M13D2M35I')
    # None of the alignment coordinates can be resolved from this cigar.
    assert aln.offset is None
    assert aln.targetshort is None
    assert aln.match is None
    assert aln.leftflank is None
    assert aln.indel is None
    assert aln.indeltype is None
    assert aln.rightflank is None
    variants = list(aln.call_variants(21))
    assert len(variants) == 1
    assert variants[0].vcf == (
        'yourchr\t801\t.\t.\t.\t.\tInscrutableCigar\t'
        'CIGAR=25D5M22I5M46D8M13D2M35I;KSW2=1000000.0;CONTIG=AACTGGTGGGCTCAAGA'
        'CTAAAAAGACTTTTTTGGTGACAAGCAGGGCGGCCTGCCCTTCCTGTAGTGCAAGAAAAT')
def main(args):
    """Apply a set of mutations to a genome (CLI entry point).

    Loads mutations from `args.mutations`, applies them to `args.genome`,
    and writes the mutated records to `args.out`.
    """
    print('[kevlar::mutate] loading mutations', file=args.logfile)
    mutations = load_mutations(kevlar.open(args.mutations, 'r'), args.logfile)
    print('[kevlar::mutate] mutating genome', file=args.logfile)
    # BUG FIX: the output must be opened exactly once. The original opened
    # `args.out` in 'w' mode inside the loop, truncating the file on every
    # iteration so only the final record survived.
    outstream = kevlar.open(args.out, 'w')
    for record in mutate_genome(args.genome, mutations):
        write_record(record, outstream)
def test_augment_contig_mates():
    """Annotations from augmented reads carry over onto the contig."""
    augreads = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('deadbeef.augfastq.gz'), 'r'))
    plaincontig = kevlar.parse_augmented_fastx(
        kevlar.open(data_file('deadbeef.contig.fa'), 'r'))
    contigs = list(augment(augreads, plaincontig))
    assert len(contigs) == 1
    assert len(contigs[0].annotations) == 74
def main(args):
    """Localize contigs against the reference genome (CLI entry point)."""
    contigstream = kevlar.parse_augmented_fastx(
        kevlar.open(args.contigs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    localizer = localize(contigstream, args.refr, ksize=args.ksize,
                         delta=args.delta)
    for record in localizer:
        khmer.utils.write_record(record, outstream)
def main(args):
    """Filter VCF calls overlapping BED regions (CLI entry point)."""
    reader = kevlar.vcf.vcfstream(args.vcf)
    bedstream = kevlar.parse_bed(kevlar.open(args.filt, 'r'))
    writer = kevlar.vcf.VCFWriter(
        kevlar.open(args.out, 'w'), source='kevlar::varfilter')
    writer.write_header()
    for varcall in varfilter(reader, bedstream):
        writer.write(varcall)
def test_suffix():
    """A dumped unpaired read still ends with a /1 or /2 mate suffix."""
    bamstream = kevlar.open(data_file('nopair.sam'), 'r')
    refrstream = kevlar.open(data_file('bogus-genome/refr.fa'), 'r')
    refr = kevlar.seqio.parse_seq_dict(refrstream)
    records = list(kevlar.dump.dump(bamstream, refr))
    assert len(records) == 1
    # str.endswith accepts a tuple of candidate suffixes.
    assert records[0].name.endswith(('/1', '/2'))
def test_kevlar_open():
    """kevlar.open reads plain files and rejects unsupported modes."""
    thefile = kevlar.tests.data_file('wasp-pass.contig.augfasta')
    contents = kevlar.open(thefile, 'r').read()
    assert len(contents.strip().split('\n')) == 9
    # Any mode other than read/write should be rejected outright.
    with pytest.raises(ValueError, match=r'invalid mode "p"'):
        kevlar.open(thefile, 'p')
def test_nomargin():
    """An alignment flush with the target edge needs no cigar adjustment."""
    contigfh = kevlar.open(data_file('nomargin-r-indel-contigs.augfasta'), 'r')
    gdnafh = kevlar.open(data_file('nomargin-r-gdna.fa'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contigfh))
    gdna = next(kevlar.parse_augmented_fastx(gdnafh))
    cigar, score = kevlar.align(gdna.sequence, contig.sequence)
    tok = AlignmentTokenizer(contig.sequence, gdna.sequence, cigar)
    # Tokenization should leave the cigar untouched.
    assert tok._cigar == tok._origcigar
def test_compact():
    """Nearby matching predictions compact down to 33 distinct calls."""
    bedstream = kevlar.open(data_file('compact-test-refr.bed.gz'), 'r')
    index = kevlar.evaluate.populate_index_from_bed(bedstream)
    reader = kevlar.vcf.VCFReader(
        kevlar.open(data_file('compact-test-pred.vcf.gz'), 'r'))
    calls = list(kevlar.evaluate.compact(reader, index, delta=10))
    assert len(calls) == 33
def test_augment_reads_mates():
    """Re-augmented reads keep their original sequences and annotations."""
    augfh = kevlar.open(data_file('deadbeef.augfastq.gz'), 'r')
    originals = list(kevlar.parse_augmented_fastx(augfh))
    plainfh = kevlar.open(data_file('deadbeef.fq.gz'), 'r')
    plainreads = kevlar.parse_augmented_fastx(plainfh)
    augmented = list(augment(originals, plainreads, upint=5))
    # Pairwise comparison: augmentation must be a lossless round trip.
    for before, after in zip(originals, augmented):
        assert before.sequence == after.sequence
        assert before.annotations == after.annotations
def main(args):
    """Dump reads of interest from a BAM/SAM stream (CLI entry point)."""
    fastq = kevlar.open(args.out, 'w')
    # The reference is optional; load it only when provided.
    refr = None
    if args.refr:
        print('[kevlar::dump] Loading reference sequence', file=args.logfile)
        refrstream = kevlar.open(args.refr, 'r')
        refr = kevlar.seqio.parse_seq_dict(refrstream)
    readstream = dump(args.reads, refr, args.pair_mode,
                      logstream=args.logfile)
    for read in readstream:
        write_record(read, fastq)
def main_greedy(args):
    """Greedy assembly CLI entry point; output file is created lazily."""
    readstream = kevlar.parse_augmented_fastx(
        kevlar.open(args.augfastq, 'r'))
    # Defer opening the output until the first contig materializes, so an
    # empty assembly produces no output file at all.
    outstream = None
    contigstream = assemble_greedy(readstream, args.gml, args.debug,
                                   args.logfile)
    for contig in contigstream:
        if outstream is None:
            outstream = kevlar.open(args.out, 'w')
        kevlar.print_augmented_fastx(contig, outstream)
def test_gap_center_aligned(contig, gdna, newcigar, origcigar, nblocks):
    """Cigar tokenization normalizes gap-flanked alignments as expected."""
    contigfh = kevlar.open(data_file('cigar/' + contig), 'r')
    gdnafh = kevlar.open(data_file('cigar/' + gdna), 'r')
    contigseq = next(kevlar.parse_augmented_fastx(contigfh))
    gdnaseq = next(kevlar.parse_augmented_fastx(gdnafh))
    cigar, score = kevlar.align(gdnaseq.sequence, contigseq.sequence)
    tok = AlignmentTokenizer(contigseq.sequence, gdnaseq.sequence, cigar)
    assert len(tok.blocks) == nblocks
    assert tok._cigar == newcigar
    assert tok._origcigar == origcigar
def main(args):
    """Assemble each partition's reads into contigs (CLI entry point)."""
    readstream = kevlar.parse_augmented_fastx(
        kevlar.open(args.augfastq, 'r'))
    # Restrict to a single partition when requested; otherwise process all.
    if args.part_id:
        partstream = kevlar.parse_single_partition(readstream, args.part_id)
    else:
        partstream = kevlar.parse_partitioned_reads(readstream)
    outstream = kevlar.open(args.out, 'w')
    for partid, contig in assemble(partstream, maxreads=args.max_reads):
        kevlar.print_augmented_fastx(contig, outstream)
def test_call_near_end(query, target, dist, n, trimcount):
    """Variants too close to the target end are trimmed per `mindist`."""
    contigfh = kevlar.open(data_file(query), 'r')
    contig = next(kevlar.parse_augmented_fastx(contigfh))
    cutoutfh = kevlar.open(data_file(target), 'r')
    cutout = next(kevlar.reference.load_refr_cutouts(cutoutfh))
    aln = VariantMapping(contig, cutout)
    calls = list(aln.call_variants(31, mindist=dist))
    assert len(calls) == n
    assert aln.trimmed == trimcount
def main(args):
    """Split partitioned reads across N output files (CLI entry point).

    Reads partitioned augmented FASTA/FASTQ from `args.infile` and
    distributes the partitions across `args.numfiles` files named
    `<base>.1` .. `<base>.N`.
    """
    partfile = kevlar.open(args.infile, 'r')
    readstream = kevlar.parse_augmented_fastx(partfile)
    partstream = kevlar.parse_partitioned_reads(readstream)
    outstreams = [
        kevlar.open('{:s}.{:d}'.format(args.base, i + 1), 'w')
        for i in range(args.numfiles)
    ]
    # FIX: close the output streams when done. The original leaked them,
    # which can leave compressed (.gz) outputs unflushed/truncated.
    try:
        split(partstream, outstreams)
    finally:
        for stream in outstreams:
            stream.close()
def test_call_num_interesting_kmers():
    """The IKMERS attribute reports the number of interesting k-mers."""
    contigfh = kevlar.open(data_file('iktest.contig.fa'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contigfh))
    cutoutfh = kevlar.open(data_file('iktest.gdna.fa'), 'r')
    cutout = next(kevlar.reference.load_refr_cutouts(cutoutfh))
    aln = VariantMapping(contig, cutout)
    calls = list(aln.call_variants(29))
    assert len(calls) == 1
    assert calls[0].attribute('IKMERS') == '1'
def test_varfilter_single():
    """A single-interval BED file marks exactly one of five calls filtered."""
    bedstream = kevlar.parse_bed(
        kevlar.open(data_file('fiveparts-ignore-single.bed'), 'r'))
    vcffile = data_file('five-snvs-with-likelihood.vcf')
    with kevlar.open(vcffile, 'r') as vcfstream:
        reader = kevlar.vcf.VCFReader(vcfstream)
        varcalls = list(kevlar.varfilter.varfilter(reader, bedstream))
    assert len(varcalls) == 5
    flagged = [vc for vc in varcalls if vc.filterstr != 'PASS']
    assert len(flagged) == 1
    assert flagged[0].position == 36385017
def test_augment_contigs():
    """Interesting k-mer annotations transfer from reads to the contig."""
    augfh = kevlar.open(data_file('snorkel.augfastq'), 'r')
    augreads = kevlar.parse_augmented_fastx(augfh)
    contigfh = kevlar.open(data_file('snorkel-contig.fasta'), 'r')
    plaincontigs = kevlar.parse_augmented_fastx(contigfh)
    augseqs = list(augment(augreads, plaincontigs))
    assert len(augseqs) == 1
    annotations = augseqs[0].annotations
    assert len(annotations) == 3
    assert [k.offset for k in annotations] == [17, 20, 22]
def filter(readfile, mask=None, memory=1e6, maxfpr=0.01, casemin=6, ctrlmax=1):
    """Two-pass abundance filter over augmented reads (generator).

    Pass 1 tallies k-mer abundances (optionally masked); pass 2 re-reads
    the input and yields reads whose annotated k-mers satisfy the case
    minimum and control maximum thresholds.
    """
    timer = kevlar.Timer()
    timer.start()
    # Pass 1: count k-mer abundances from the annotated reads.
    reader = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    counts = first_pass(reader, mask, memory, timer)
    check_fpr(counts, maxfpr)
    # Pass 2: re-open the input and emit reads passing the thresholds.
    reader = kevlar.parse_augmented_fastx(kevlar.open(readfile, 'r'))
    yield from second_pass(reader, counts, casemin, ctrlmax, timer)
    elapsed = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(elapsed)
    kevlar.plog('[kevlar::filter]', message)
def test_call_pico_indel(ccid, varcall):
    """Indel calls for pico dataset partitions match the expected VCF line."""
    qfile = data_file('pico' + ccid + '.contig.augfasta')
    tfile = data_file('pico' + ccid + '.gdna.fa')
    queries = list(kevlar.parse_augmented_fastx(kevlar.open(qfile, 'r')))
    targets = list(
        kevlar.reference.load_refr_cutouts(kevlar.open(tfile, 'r')))
    calls = list(call(targets, queries))
    assert len(calls) == 1
    assert str(calls[0]) == varcall
def test_gap_center_aligned():
    """A contig aligned mid-cutout tokenizes to normalized D/M/D blocks."""
    contigfh = kevlar.open(data_file('cigar/b.contig.fa'), 'r')
    contig = next(kevlar.parse_augmented_fastx(contigfh))
    gdnafh = kevlar.open(data_file('cigar/b.gdna.fa'), 'r')
    gdna = next(kevlar.parse_augmented_fastx(gdnafh))
    cigar, score = kevlar.align(gdna.sequence, contig.sequence)
    tok = AlignmentTokenizer(contig.sequence, gdna.sequence, cigar)
    assert len(tok.blocks) == 3
    # The trailing 6M is folded into the central match block.
    assert tok._cigar == '41D150M50D'
    assert tok._origcigar == '41D144M50D6M'
def main(args):
    """Localize contigs and write reference cutouts (CLI entry point)."""
    contigstream = kevlar.parse_augmented_fastx(
        kevlar.open(args.contigs, 'r'))
    outstream = kevlar.open(args.out, 'w')
    localizer = localize(
        contigstream, args.refr, seedsize=args.seed_size, delta=args.delta,
        maxdiff=args.max_diff, logstream=args.logfile,
    )
    for cutout in localizer:
        record = Record(name=cutout.defline, sequence=cutout.sequence)
        kevlar.sequence.write_record(record, outstream)