def check_fpr(counts, maxfpr):
    """Log the estimated FPR of `counts` and abort if it exceeds `maxfpr`.

    - `counts`: re-computed k-mer counts, handed to
      `kevlar.sketch.estimate_fpr`
    - `maxfpr`: largest false positive rate that is tolerated

    Raises `kevlar.sketch.KevlarUnsuitableFPRError` when the estimate is
    strictly greater than `maxfpr`.
    """
    fpr = kevlar.sketch.estimate_fpr(counts)
    message = 'FPR for re-computed k-mer counts: {:1.3f}'.format(fpr)
    kevlar.plog('[kevlar::filter]', message)
    if fpr > maxfpr:
        # Separate the verdict from the estimate; appended bare, the two
        # run together ("...0.250FPR too high...").
        message += ' (FPR too high, bailing out!!!)'
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)
def cutout(contigs, refrseqs, seed_matches, seedsize=51, delta=50,
           maxdiff=None, inclpattern=None, exclpattern=None, debug=False):
    """Compute reference target sequences for a set of partitioned contigs.

    Each contig is decomposed into seeds, the recorded genomic positions of
    every seed are registered with a localizer, and the localizer then
    yields the cutouts spanning those seeds (extended by `delta`).

    - `seed_matches`: mapping from seed (in `kevlar.revcommin` form) to an
      iterable of `(seqid, position)` pairs
    - `maxdiff`: cluster distance handed to the localizer; when None it
      defaults to three times the length of the longest contig
    """
    localizer = kevlar.localize.Localizer(
        seedsize, incl=inclpattern, excl=exclpattern
    )
    for contig in contigs:
        for rawseed in decompose_seeds(contig.sequence, seedsize):
            seed = kevlar.revcommin(rawseed)
            if seed in seed_matches:
                for seqid, position in seed_matches[seed]:
                    localizer.add_seed_match(seqid, position)
            elif debug:  # pragma: no cover
                message = 'WARNING: no position for seed {}'.format(seed)
                kevlar.plog('[kevlar::localize]', message)

    if maxdiff is None:
        maxdiff = max([len(c.sequence) for c in contigs]) * 3
    yield from localizer.get_cutouts(
        refrseqs=refrseqs, delta=delta, clusterdist=maxdiff
    )
def augment(augseqstream, nakedseqstream, upint=10000):
    """Augment an unannotated stream of sequences.

    - `augseqstream`: a stream of sequences annotated with k-mers of interest
    - `nakedseqstream`: a stream of unannotated sequences, to be augmented
      with k-mers of interest from `augseqstream`
    - `upint`: a progress message is logged every `upint` annotated records

    Yields one new `kevlar.sequence.Record` per record of `nakedseqstream`,
    annotated with every k-mer of interest (in either orientation) found in
    its sequence. If `augseqstream` carries no k-mers of interest at all,
    the records are yielded without annotations.
    """
    ksize = None
    ikmers = dict()
    for n, record in enumerate(augseqstream):
        if n > 0 and n % upint == 0:
            kevlar.plog('[kevlar::augment] processed', n, 'input reads')
        for ikmer in record.annotations:
            seq = record.ikmerseq(ikmer)
            # Index both orientations so lookups need no canonicalization
            ikmers[seq] = ikmer.abund
            ikmers[kevlar.revcom(seq)] = ikmer.abund
            ksize = ikmer.ksize

    for record in nakedseqstream:
        qual = getattr(record, 'quality', None)
        newrecord = kevlar.sequence.Record(
            name=record.name, sequence=record.sequence, quality=qual,
        )
        # Without any k-mer of interest the k-mer size is unknown and there
        # is nothing to look up; previously this raised a TypeError.
        if ksize is not None:
            numkmers = len(record.sequence) - ksize + 1
            for offset in range(numkmers):
                kmer = record.sequence[offset:offset + ksize]
                if kmer in ikmers:
                    newrecord.annotate(kmer, offset, ikmers[kmer])
        yield newrecord
def main(args):
    """Count k-mers for one sample as configured by `args`.

    Raises `ValueError` unless `--num-bands` and `--band` are either both
    given or both omitted. Logs the table configuration up front and the
    total elapsed time at the end.
    """
    has_numbands = args.num_bands is not None
    has_band = args.band is not None
    if has_numbands != has_band:
        raise ValueError('Must specify --num-bands and --band together')

    # NOTE(review): presumably --band is 1-based on the command line and is
    # shifted to a 0-based index here -- confirm against the CLI definition
    if args.band:
        myband = args.band - 1
    else:
        myband = None

    if args.mask:
        args.mask = kevlar.sketch.load(args.mask)
    print_config(args)

    timer = kevlar.Timer()
    timer.start()
    sketch = load_sample_seqfile(
        args.seqfile, args.ksize, args.memory, args.max_fpr,
        count=args.counter_size > 1,
        smallcount=args.counter_size == 4,
        mask=args.mask,
        consume_masked=args.count_masked,
        numbands=args.num_bands,
        band=myband,
        numthreads=args.threads,
        outfile=args.counttable,
    )
    elapsed = timer.stop()
    kevlar.plog('[kevlar::count] Total time: {:.2f} seconds'.format(elapsed))
def first_pass(reads, mask, memory, timer):
    """Re-count the k-mers of interest annotated on `reads`.

    - `reads`: stream of annotated reads; reads without annotations are
      skipped
    - `mask`: optional table; k-mers with `mask.get(kmer) > 0` are not
      counted
    - `memory`: memory budget used to size the count table
    - `timer`: timer on which the 'firstpass' interval is recorded

    Returns the populated `khmer.Counttable`, or None if no read carried
    any annotation.
    """
    kevlar.plog('[kevlar::filter] First pass: re-counting k-mers')
    timer.start('firstpass')
    counts = None
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::filter] processed {counter} reads',
        interval=1e5, breaks=[1e6, 1e7],
    )
    # Pre-set the read count so the summary below also works for an empty
    # stream (the loop variable would otherwise be unbound).
    n = 0
    for n, read in enumerate(reads, 1):
        progress_indicator.update()
        if len(read.annotations) == 0:
            continue
        if counts is None:
            # The table is created lazily: the k-mer size is only known
            # once the first annotated read has been seen.
            ksize = read.annotations[0].ksize
            counts = khmer.Counttable(ksize, memory / 4, 4)
        for ikmer in read.annotations:
            ikseq = read.ikmerseq(ikmer)
            if mask and mask.get(ikseq) > 0:
                continue
            counts.add(ikseq)
    elapsed = timer.stop('firstpass')
    message = 'First pass complete!'
    message += ' Processed {:d} reads in {:.2f} seconds!'.format(n, elapsed)
    kevlar.plog('[kevlar::filter]', message)
    return counts
def load_samples(counttables=None, filelists=None, ksize=31, memory=1e6,
                 maxfpr=0.2, numbands=None, band=None, numthreads=1,
                 outfilelist=None):
    """Obtain one k-mer count sketch per sample.

    Pre-computed `counttables` take precedence: when given they are loaded
    and `filelists` is ignored. Otherwise a sketch is computed from each
    entry of `filelists`, and the results are saved to `outfilelist` when
    that is provided. Returns the list of sketches.
    """
    assert counttables or filelists

    if counttables:
        notice = ''.join([
            'counttables for {:d} sample(s) provided'.format(
                len(counttables)
            ),
            ', any corresponding FASTA/FASTQ input will be ignored ',
            'for computing k-mer abundances',
        ])
        kevlar.plog('[kevlar::novel] INFO:', notice)
        return kevlar.sketch.load_sketchfiles(counttables, maxfpr)

    samples = [
        kevlar.count.load_sample_seqfile(
            seqfiles, ksize, memory, maxfpr=maxfpr, numbands=numbands,
            band=band, numthreads=numthreads,
        )
        for seqfiles in filelists
    ]
    if outfilelist:
        save_counts(outfilelist, samples)
    return samples
def assemble(partstream, maxreads=10000):
    """Assemble each partition of reads and yield the resulting contigs.

    - `partstream`: stream of `(partid, partition)` pairs
    - `maxreads`: partitions with more reads than this are skipped with a
      warning

    Yields `(partid, contig)` pairs. Contigs are renamed `contigN`, numbered
    across all partitions, with ` kvcc=<partid>` appended when the partition
    ID is not None. A summary is logged once the stream is exhausted.
    """
    ncontigs = 0
    nparts = 0
    progress = kevlar.ProgressIndicator(
        '[kevlar::assemble] {counter} partitions assembled',
        interval=10, breaks=[100, 1000, 10000], usetimer=True,
    )
    for nparts, (partid, partition) in enumerate(partstream, 1):
        progress.update()
        numreads = len(partition)
        if numreads > maxreads:  # pragma: no cover
            warning = 'skipping partition with {:d} reads'.format(numreads)
            kevlar.plog('[kevlar::assemble] WARNING:', warning)
            continue
        suffix = '' if partid is None else ' kvcc={}'.format(partid)
        for contig in assemble_fml_asm(partition):
            ncontigs += 1
            contig.name = 'contig{}'.format(ncontigs) + suffix
            yield partid, contig

    summary = 'processed {} partitions'.format(nparts)
    summary += ' and assembled {} contigs'.format(ncontigs)
    kevlar.plog('[kevlar::assemble]', summary)
def window_check(call, ksize=31):
    """Report whether `call` lacks usable allele-spanning windows.

    A window is unusable when it is missing (None) or shorter than `ksize`;
    both `call.window` (alt) and `call.refrwindow` (ref) are examined.
    When a problem is found and the call's filter string is 'PASS', the
    specific reasons are logged.

    Returns True if either window is unusable, False otherwise.
    """
    altspan = call.window
    refspan = call.refrwindow
    altmissing = altspan is None
    refmissing = refspan is None
    # Test against None explicitly: a truthiness test would let an empty
    # (zero-length) window slip through as neither missing nor short.
    altshort = not altmissing and len(altspan) < ksize
    refshort = not refmissing and len(refspan) < ksize
    if not (altmissing or refmissing or altshort or refshort):
        return False

    if call.filterstr == 'PASS':
        message = 'WARNING: stubbornly refusing to compute likelihood:'
        kevlar.plog('[kevlar::simlike]', message)
        if altmissing:
            message = ' missing alt allele spanning window'
            kevlar.plog('[kevlar::simlike]', message)
        if refmissing:
            message = ' missing refr allele spanning window'
            kevlar.plog('[kevlar::simlike]', message)
        if altshort:
            message = ' alt allele spanning window {:s}'.format(altspan)
            message += ', shorter than k size {:d}'.format(ksize)
            kevlar.plog('[kevlar::simlike]', message)
        if refshort:
            message = ' ref allele spanning window {:s}'.format(refspan)
            message += ', shorter than k size {:d}'.format(ksize)
            kevlar.plog('[kevlar::simlike]', message)
    return True
def bwa_align(cmdargs, seqstring=None, seqfilename=None):
    """Run a BWA command and yield its mapped alignments.

    - `cmdargs`: the complete command, as an argument list
    - `seqstring`: sequence data fed to the command on standard input
    - `seqfilename`: name of a sequence file; it is only checked for
      presence here, the command in `cmdargs` is run as given

    Exactly one of `seqstring` and `seqfilename` must be supplied,
    otherwise an `Exception` is raised. Raises `KevlarBWAError` if the
    command exits with a non-zero status.

    Yields `(seqid, start, end, sequence)` for every mapped SAM record.
    """
    if (not seqstring) is (not seqfilename):
        raise Exception('supply sequence string or file, not both')
    with TemporaryFile() as samfile:
        kmerseqs = dict()
        if seqstring:
            bwaproc = Popen(cmdargs, stdin=PIPE, stdout=samfile, stderr=PIPE,
                            universal_newlines=True)
            _, stderr = bwaproc.communicate(input=seqstring)
        else:
            # Text mode so the captured diagnostics are str in both branches
            bwaproc = Popen(cmdargs, stdout=samfile, stderr=PIPE,
                            universal_newlines=True)
            _, stderr = bwaproc.communicate()
        if bwaproc.returncode != 0:
            # Report what BWA actually wrote to stderr, not the stream object
            kevlar.plog(stderr)
            raise KevlarBWAError('problem running BWA')
        samfile.seek(0)
        sam = pysam.AlignmentFile(samfile, 'r')
        for record in sam:
            if record.is_unmapped:
                continue
            seqid = sam.get_reference_name(record.reference_id)
            seq = record.seq
            if seq:
                kmerseqs[record.query_name] = seq
            else:
                # Records without a sequence reuse the one stored earlier
                # under the same query name
                seq = kmerseqs[record.query_name]
            yield seqid, record.reference_start, record.reference_end, seq
def second_pass(reads, counts, casemin, ctrlmax, timer):
    """Discard k-mers of interest (and reads) that fail re-validation.

    A k-mer is dropped when any abundance after the first one exceeds
    `ctrlmax`, or when its re-computed count in `counts` is below
    `casemin`. Surviving k-mers get the re-computed count as their first
    abundance. Reads left with at least one k-mer are yielded with their
    annotations replaced; the rest are skipped.
    """
    kevlar.plog('[kevlar::filter] Second pass: discarding k-mers/reads')
    timer.start('secondpass')
    kept = 0
    progress = kevlar.ProgressIndicator(
        '[kevlar::filter] processed {counter} reads',
        interval=1e5, breaks=[1e6, 1e7],
    )
    for read in reads:
        progress.update()
        survivors = []
        for ikmer in read.annotations:
            ikseq = read.ikmerseq(ikmer)
            others = ikmer.abund[1:]
            if any(abund > ctrlmax for abund in others):
                continue
            newcount = counts.get(ikseq)
            if newcount < casemin:
                continue
            newabund = (newcount, *others)
            survivors.append(
                KmerOfInterest(ikmer.ksize, ikmer.offset, newabund)
            )
        if not survivors:
            continue
        read.annotations = survivors
        yield read
        kept += 1

    elapsed = timer.stop('secondpass')
    summary = 'Second pass complete!'
    summary += ' Validated {:d} reads in {:.2f} seconds!'.format(kept, elapsed)
    kevlar.plog('[kevlar::filter]', summary)
def update(self):
    """Advance the counter by one, logging a progress message when due.

    When the counter hits one of the configured breaks, the reporting
    interval is reset to that counter value. A message is logged once the
    counter reaches the next scheduled update, with the elapsed time
    appended if a timer is attached.
    """
    current = self.counter
    if current in self.breaks:
        self.interval = current
    if current >= self.nextupdate:
        self.nextupdate += self.interval
        text = self.message.format(counter=current)
        if self.timer:
            text += ' ({:.2f} seconds elapsed)'.format(self.timer.probe())
        kevlar.plog(text)
    self.counter = current + 1
def load_contigs(contigstream):
    """Collect a stream of `(partid, contiglist)` pairs into a dictionary.

    Returns a dict mapping each partition ID to its contig list; a later
    entry with the same ID replaces an earlier one. The number of contigs
    and partitions read is logged.
    """
    kevlar.plog('[kevlar::call] Loading contigs into memory by partition')
    contigs_by_partition = {}
    ncontigs = 0
    nparts = 0
    for nparts, (partid, contiglist) in enumerate(contigstream, 1):
        ncontigs += len(contiglist)
        contigs_by_partition[partid] = contiglist
    summary = 'Loaded {} contigs from {} partitions'.format(ncontigs, nparts)
    kevlar.plog('[kevlar::call]', summary)
    return contigs_by_partition
def filter(readfile, mask=None, memory=1e6, maxfpr=0.01, casemin=6,
           ctrlmax=1):
    """Re-validate the k-mers of interest in an augmented read file.

    The file is read twice: a first pass re-counts the annotated k-mers
    (skipping those present in `mask`), then, provided the estimated FPR of
    the new counts does not exceed `maxfpr`, a second pass yields the reads
    that still carry validated k-mers given `casemin` and `ctrlmax`.

    The total elapsed time is logged once the generator is exhausted.
    """
    timer = kevlar.Timer()
    timer.start()

    # Each pass gets its own handle, closed as soon as the pass is done;
    # previously both handles were left open.
    with kevlar.open(readfile, 'r') as firsthandle:
        reader = kevlar.parse_augmented_fastx(firsthandle)
        counts = first_pass(reader, mask, memory, timer)
    check_fpr(counts, maxfpr)

    with kevlar.open(readfile, 'r') as secondhandle:
        reader = kevlar.parse_augmented_fastx(secondhandle)
        for read in second_pass(reader, counts, casemin, ctrlmax, timer):
            yield read

    total = timer.stop()
    message = 'Total time: {:.2f} seconds'.format(total)
    kevlar.plog('[kevlar::filter]', message)
def save_counts(filelist, tablelist):
    """Save each count table to its corresponding output file.

    - `filelist`: output filenames, one per table; '.counttable' is
      appended to names not ending in '.ct' or '.counttable'
    - `tablelist`: the count tables to save

    If the two lists differ in length a warning is logged and nothing is
    saved.
    """
    if len(filelist) != len(tablelist):
        msg = 'number of filenames provided ({:d})'.format(len(filelist))
        # Leading space keeps the count and the verb from running together
        msg += ' does not match the number of '
        msg += 'samples provided ({:d})'.format(len(tablelist))
        msg += '; stubbornly refusing to save k-mer counts'
        kevlar.plog('[kevlar::novel] WARNING:', msg)
        return
    for outfile, counttable in zip(filelist, tablelist):
        if not outfile.endswith(('.ct', '.counttable')):
            outfile += '.counttable'
        kevlar.plog(' saved to "{}"'.format(os.path.abspath(outfile)))
        counttable.save(outfile)
def write_records_to_batches(recordstream, batchfiles):
    """Distribute records over the batch files according to their names.

    The batch for a record is chosen from the hash of its name, so records
    sharing a name are written to the same file.
    """
    numbatches = len(batchfiles)
    kevlar.plog(
        '[kevlar::unband]',
        'writing records to {:d} temp batch files'.format(numbatches),
    )
    progress = kevlar.ProgressIndicator(
        '[kevlar::unband] processed {counter} reads',
        interval=1e5, breaks=[1e6, 1e7],
    )
    for record in recordstream:
        progress.update()
        destination = batchfiles[hash(record.name) % numbatches]
        kevlar.print_augmented_fastx(record, destination)
def get_seed_matches(seedfile, refrfile, seedsize=51):
    """Determine the position of all seeds with a single system call to BWA.

    - `seedfile`: file of seed sequences to align
    - `refrfile`: BWA-indexed reference
    - `seedsize`: seed length, used for both the `-k` and `-T` options

    Returns a mapping from each seed (in `kevlar.revcommin` form) to the set
    of `(seqid, start)` positions at which it aligned.
    """
    kevlar.plog('[kevlar::localize] computing seed matches')
    # Build the argument list directly rather than splitting a formatted
    # string, so file paths containing whitespace stay intact.
    bwa_args = [
        'bwa', 'mem', '-k', str(seedsize), '-T', str(seedsize), '-a',
        '-c', '5000', str(refrfile), str(seedfile),
    ]
    seed_index = defaultdict(set)
    for seqid, start, end, seq in bwa_align(bwa_args, seqfilename=seedfile):
        minseq = kevlar.revcommin(seq)
        seed_index[minseq].add((seqid, start))
    message = 'found positions for {} seeds'.format(len(seed_index))
    kevlar.plog('[kevlar::localize]', message)
    return seed_index
def load_sketchfiles(sketchfiles, maxfpr=0.2):
    """Load samples from pre-computed k-mer abundances.

    Each file is loaded and its false positive rate estimated; a
    `KevlarUnsuitableFPRError` is raised for the first sketch whose
    estimate exceeds `maxfpr`. Returns the sketches in input order.
    """
    sketches = []
    for sketchfile in sketchfiles:
        loading = 'loading sketchfile "{}"...'.format(sketchfile)
        kevlar.plog('[kevlar::sketch] ', loading, end='')
        sketch = autoload(sketchfile)
        fpr = estimate_fpr(sketch)
        status = 'done! estimated false positive rate is {:1.3f}'.format(fpr)
        if fpr > maxfpr:
            raise KevlarUnsuitableFPRError(
                status + ' (FPR too high, bailing out!!!)'
            )
        kevlar.plog(status)
        sketches.append(sketch)
    return sketches
def varfilter(callstream, maskstream):
    """Flag preliminary calls that overlap any region of the mask.

    The calls are first loaded into an index; every call overlapping a
    `(chrom, start, end, data)` region from `maskstream` is marked with the
    `UserFilter` filter. All calls, flagged or not, are then yielded.
    """
    callindex = load_predictions(callstream)
    kevlar.plog('[kevlar::varfilter]', 'Filtering preliminary variant calls')
    progress = kevlar.ProgressIndicator(
        '[kevlar::varfilter] {counter} regions processed',
        interval=1e5, breaks=[1e6, 1e6, 1e7], usetimer=True,
    )
    userfilter = kevlar.vcf.VariantFilter.UserFilter
    for chrom, start, end, _ in maskstream:
        for hit in callindex.query(chrom, start, end):
            hit.data.filter(userfilter)
        progress.update()
    yield from callindex
def print_config(args):
    """Log a description of the table selected by `args.counter_size`.

    Counter sizes of 1, 4 and 8 are recognized; any other value raises a
    `KeyError`.
    """
    tabletypes = {1: 'node', 4: 'small count', 8: 'count'}
    maxcounts = {1: 1, 4: 15, 8: 255}
    size = args.counter_size

    parts = ['Storing k-mers in a {} table'.format(tabletypes[size])]
    if size == 1:
        parts.append(' (Bloom filter)')
        parts.append(' for k-mer presence/absence queries')
    else:
        parts.append(', a CountMin sketch')
        parts.append(' with a counter size of {} bits'.format(size))
        parts.append(', for k-mer abundance queries')
        parts.append(' (max abundance {})'.format(maxcounts[size]))
    kevlar.plog('[kevlar::count]', ''.join(parts))
def prelim_call(targetlist, querylist, partid=None, match=1, mismatch=2,
                gapopen=5, gapextend=0, ksize=31, refrfile=None, debug=False,
                mindist=5, homopolyfilt=True, maxtargetlen=10000):
    """Implement the `kevlar call` procedure as a generator function.

    Queries are processed from longest to shortest. Each query is mapped
    against every target (in order of defline); targets longer than
    `maxtargetlen` are mapped with `nocall` set. Variants called from the
    alignments selected for reporting are yielded, annotated with `PART`
    when `partid` is not None.
    """
    for query in sorted(querylist, reverse=True, key=len):
        alignments = [
            VariantMapping(
                query, target, match=match, mismatch=mismatch,
                gapopen=gapopen, gapextend=gapextend,
                homopolyfilt=homopolyfilt,
                nocall=bool(maxtargetlen and len(target) > maxtargetlen),
            )
            for target in sorted(targetlist, key=lambda cutout: cutout.defline)
        ]
        for alignment in alignments_to_report(alignments):
            if debug:
                kevlar.plog(
                    'DEBUG ', alignment.cutout.defline, ' vs ',
                    alignment.contig.name, '\n', str(alignment), sep='',
                    end='\n\n',
                )
            for varcall in alignment.call_variants(ksize, mindist):
                if partid is not None:
                    varcall.annotate('PART', partid)
                yield varcall
def contigs_2_seeds(partstream, seedstream, seedsize=51):
    """Convert a stream of partitioned contigs to seeds and write to a file.

    - `partstream`: stream of partitions, each an iterable of contigs
    - `seedstream`: writable handle receiving the seeds in FASTA format
    - `seedsize`: seed length

    Seeds are stored in `kevlar.revcommin` form, de-duplicated, and written
    in sorted order as `>seedN` records numbered from 0.
    """
    message = 'decomposing contigs into seeds of length {}'.format(seedsize)
    kevlar.plog('[kevlar::localize]', message)
    seeds = set()
    for partition in partstream:
        for contig in partition:
            for seed in decompose_seeds(contig.sequence, seedsize):
                seeds.add(kevlar.revcommin(seed))
    for n, seed in enumerate(sorted(seeds)):
        print('>seed{}\n{}'.format(n, seed), file=seedstream)
    seedstream.flush()
    # Report the number of seeds, not the index of the last one written
    message = 'contigs decomposed into {} seeds'.format(len(seeds))
    kevlar.plog('[kevlar::localize]', message)
def __iter__(self):
    """Yield one variant per data line of the input.

    The header is scanned up to the `#CHROM` line, from which the sample
    labels are saved. If a data line shows up before any `#CHROM` line, a
    warning is logged and parsing proceeds without sample labels.

    NOTE(review): the second loop assumes `self._in` is an iterator that
    resumes where the first loop stopped -- confirm against the caller.
    """
    for line in self._in:
        if line.startswith('#CHROM\t'):
            self._save_samples(line)
            break
        if line.startswith('#'):
            continue
        # First data line reached without having seen a #CHROM header
        warning = 'WARNING: VCF file has no samples annotated'
        warning += ', certain sanity checks disabled'
        kevlar.plog('[kevlar::vcf]', warning)
        yield self._variant_from_vcf_string(line)
        break

    for line in self._in:
        if not line.startswith('#'):
            yield self._variant_from_vcf_string(line)
def autoindex(refrfile):
    """Make sure a BWA index exists for `refrfile`, building it if needed.

    The index is considered present when `<refrfile>.bwt` exists. Raises
    `KevlarBWAError` if the reference file does not exist or if
    `bwa index` cannot be run.
    """
    if not os.path.isfile(refrfile):
        raise KevlarBWAError(
            'reference file {:s} does not exist'.format(refrfile)
        )
    if os.path.isfile(refrfile + '.bwt'):
        return

    notice = 'WARNING: BWA index not found for "{:s}"'.format(refrfile)
    notice += ', indexing now'
    kevlar.plog('[kevlar::reference]', notice)
    try:
        check_call(['bwa', 'index', refrfile])
    except Exception as err:  # pragma: no cover
        raise KevlarBWAError('Could not run "bwa index"') from err
def main(arglist=None):
    """Entry point for the kevlar CLI.

    Kept as a function so that other Python code (e.g. tests) can invoke
    the CLI by passing the arguments in directly. When no arguments are
    passed, they are parsed from the command line.
    """
    args = kevlar.cli.parse_args(arglist)
    if args.cmd is None:  # pragma: no cover
        kevlar.cli.parser().parse_args(['-h'])

    assert args.cmd in kevlar.cli.mains
    run_subcommand = kevlar.cli.mains[args.cmd]
    kevlar.plog('[kevlar] running version {}'.format(kevlar.__version__))
    run_subcommand(args)
def split(pstream, outstreams, maxreads=10000):
    """Split the partitions across the N outstreams.

    Partitions are paired with the output streams in round-robin order.
    A partition with more than `maxreads` reads is discarded with a
    warning; it still uses up its turn in the rotation.
    """
    progress = kevlar.ProgressIndicator(
        '[kevlar::split] processed {counter} partitions',
        interval=100, breaks=[1000, 10000, 100000], usetimer=True,
    )
    for (partid, partition), outstream in zip(pstream, cycle(outstreams)):
        numreads = len(partition)
        if numreads > maxreads:
            warning = 'WARNING: discarding partition '
            warning += 'with {} reads'.format(numreads)
            kevlar.plog('[kevlar::split]', warning)
            continue
        for read in partition:
            kevlar.print_augmented_fastx(read, outstream)
        progress.update()
def _variant_from_vcf_string(self, vcfstr):
    """Build a `Variant` from a single tab-delimited VCF data line.

    The position is converted from the VCF coordinate by subtracting one
    (a '.' position is kept as is). INFO entries become annotations,
    FILTER labels known to `VariantFilter` are applied, and per-sample
    FORMAT values are attached using the labels in `self._sample_labels`.

    Raises `VariantAnnotationError` if the number of sample columns does
    not match the annotated samples, or if a sample column holds a
    different number of values than the FORMAT column has keys.
    """
    fields = vcfstr.strip().split('\t')
    seqid = fields[0]
    pos = '.' if fields[1] == '.' else int(fields[1]) - 1
    refr = fields[3]
    alt = fields[4]
    filterstr = fields[6]
    variant = Variant(seqid, pos, refr, alt)

    for kvp in fields[7].split(';'):
        if '=' in kvp:
            # Split on the first '=' only, so that a value which itself
            # contains '=' no longer breaks the unpacking
            key, values = kvp.split('=', 1)
            for value in values.split(','):
                variant.annotate(key, value)
        else:
            # Flag-style entry without a value
            variant.annotate(kvp, True)

    if filterstr not in ('.', 'PASS'):
        for filterlabel in filterstr.split(';'):
            if hasattr(VariantFilter, filterlabel):
                variant.filter(VariantFilter[filterlabel])
            elif not self.suppress_filter_warnings:
                # Name the offending label, not the whole FILTER column
                message = 'filter "{}" not recognized'.format(filterlabel)
                message += '; attempting to write this variant to VCF'
                message += ' will probably turn out poorly'
                kevlar.plog('[kevlar::vcf]', message)

    if len(fields) > 9:
        fmtkeys = fields[8].split(':')
        sample_data = fields[9:]
        n_ann_samples = len(self._sample_labels)
        # The count is only enforced when sample labels are known
        if n_ann_samples > 0 and len(sample_data) != n_ann_samples:
            message = 'sample number mismatch: ' + vcfstr
            raise VariantAnnotationError(message)
        for label, data in zip(self._sample_labels, sample_data):
            if data in ('.', './.'):
                continue
            fmtvalues = data.split(':')
            if len(fmtkeys) != len(fmtvalues):
                message = 'format data mismatch: ' + vcfstr
                raise VariantAnnotationError(message)
            for datakey, datavalue in zip(fmtkeys, fmtvalues):
                variant.format(label, datakey, datavalue)
    return variant
def count_first_pass(infiles, counts, mask, nthreads=1):
    """Consume every input file into `counts`, using `nthreads` threads.

    For each file, all threads share one `khmer.ReadParser` and each calls
    `counts.consume_seqfile_with_mask` with `mask`. Files are processed one
    after another; all threads are joined before moving on.
    """
    kevlar.plog(
        '[kevlar::dist]',
        'Processing input with {:d} threads'.format(nthreads),
    )
    consume_options = {'threshold': 1, 'consume_masked': True}
    for filename in infiles:
        kevlar.plog(' -', filename)
        parser = khmer.ReadParser(filename)
        workers = [
            threading.Thread(
                target=counts.consume_seqfile_with_mask,
                args=(parser, mask),
                kwargs=dict(consume_options),
            )
            for _ in range(nthreads)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    kevlar.plog('[kevlar::dist] Done processing input!')
def count_second_pass(infiles, counts, nthreads=1):
    """Compute the k-mer abundance distribution over all input files.

    Each thread computes a distribution via
    `counts.abundance_distribution`, sharing a single tracking table; the
    per-thread results are then summed.

    Returns a mapping from abundance to the number of k-mers observed with
    that abundance; abundance 0 and zero counts are left out.
    """
    kevlar.plog('[kevlar::dist] Second pass over the data')
    tracking = khmer.Nodetable(counts.ksize(), 1, 1,
                               primes=counts.hashsizes())
    abund_lists = []

    def _collect_distribution(parser):
        abund_lists.append(counts.abundance_distribution(parser, tracking))

    for filename in infiles:
        kevlar.plog(' -', filename)
        parser = khmer.ReadParser(filename)
        workers = [
            threading.Thread(target=_collect_distribution, args=(parser, ))
            for _ in range(nthreads)
        ]
        for worker in workers:
            worker.start()
        for worker in workers:
            worker.join()
    assert len(abund_lists) == len(infiles) * nthreads

    abundance = defaultdict(int)
    for distribution in abund_lists:
        for level, count in enumerate(distribution):
            if level == 0 or not count > 0:
                continue
            abundance[level] += count
    kevlar.plog('[kevlar::dist] Done second pass over input!')
    return abundance
def main(args):
    """Partition the reads of `args.infile` into connected components.

    With `args.split` set, each partition is written to its own
    `<split>.cc<N>.augfastq.gz` file; otherwise all partitions are written
    to `args.out`. A summary of reads and components is logged at the end.
    """
    if args.split:
        kevlar.mkdirp(args.split, trim=True)
    outstream = None if args.split else kevlar.open(args.out, 'w')
    readstream = kevlar.parse_augmented_fastx(kevlar.open(args.infile, 'r'))
    partitioner = partition(
        readstream, strict=args.strict, minabund=args.min_abund,
        maxabund=args.max_abund, dedup=args.dedup, gmlfile=args.gml,
    )
    numreads = 0
    # Pre-set so the summary below also works when the partitioner yields
    # nothing (the loop variable would otherwise be unbound).
    partnum = 0
    for partnum, part in partitioner:
        numreads += len(part)
        if args.split:
            ofname = '{:s}.cc{:d}.augfastq.gz'.format(args.split, partnum)
            with kevlar.open(ofname, 'w') as outfile:
                for read in part:
                    kevlar.print_augmented_fastx(read, outfile)
        else:
            for read in part:
                kevlar.print_augmented_fastx(read, outstream)
    message = 'grouped {:d} reads'.format(numreads)
    message += ' into {:d} connected components'.format(partnum)
    kevlar.plog('[kevlar::partition]', message)
def resolve_batches(batchfiles):
    """Yield the reads resolved from each batch file, batch by batch.

    Progress is logged after every batch and once more when all batches
    are done.
    """
    kevlar.plog(
        '[kevlar::unband]',
        'resolving duplicate reads in {:d} batches'.format(len(batchfiles)),
    )
    for index, batchfile in enumerate(batchfiles):
        yield from resolve_batch(batchfile)
        kevlar.plog('[kevlar::unband] batch {:d} complete'.format(index))
    kevlar.plog('[kevlar::unband] Done!')