def test_consume_with_mask_threshold():
    """
    Test bulk loading with a mask and an abundance threshold.

    The mask is populated by consuming a 34 bp sequence three times and a
    37 bp extension of it (the same sequence followed by `AGT`) twice. The
    three 13-mers that only occur in the extension are therefore seen twice
    in the mask, while every other 13-mer is seen five times.

    The sequence in `seq-b.fa` is then loaded against that mask with a
    threshold of 3. Only the final three 13-mers of the sequence are expected
    to end up in the counttable; the 13-mer just before them is expected to
    be left out.
    """
    core = 'TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA'
    extended = 'TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT'

    mask = khmer.Counttable(13, 1e3, 4)
    for _ in range(3):
        mask.consume(core)
    for _ in range(2):
        mask.consume(extended)

    seqfile = utils.get_test_data('seq-b.fa')
    table = khmer.Counttable(13, 1e3, 4)
    nreads, nkmers = table.consume_seqfile_with_mask(seqfile, mask, 3)

    assert nreads == 1
    assert nkmers == 3
    assert table.get('GATTTGAGAAAAA') == 0  # in the mask
    for loaded in ('ATTTGAGAAAAAA', 'TTTGAGAAAAAAG', 'TTGAGAAAAAAGT'):
        assert table.get(loaded) == 1
def test_banding_in_memory(ksize, memory, epsilon, numbands):
    """
    Test accuracy of banding functionally.

    One counttable is filled from the whole input in the normal fashion, and
    `numbands` smaller counttables are each filled with a single band of the
    same input. For a sample of the reads, the per-band abundances of every
    k-mer are compared against the abundance from the normal table.
    """
    infile = utils.get_test_data('banding-reads.fq')

    reference = khmer.Counttable(ksize, memory / 4, 4)
    reference.consume_seqfile(infile)

    band_tables = []
    for band in range(numbands):
        table = khmer.Counttable(ksize, memory / 4 / numbands, 4)
        table.consume_seqfile_banding(infile, numbands, band)
        band_tables.append(table)

    for index, record in enumerate(screed.open(infile)):
        # Spot-check only every 100th record, skipping the very first one.
        if index <= 0 or index % 100 != 0:
            continue
        for kmer in reference.get_kmers(record.sequence):
            expected = reference.get(kmer)
            per_band = [table.get(kmer) for table in band_tables]

            # Ideally the totals would be equal. Practically, with false
            # positives, a small difference has to be tolerated.
            assert abs(sum(per_band) - expected) <= epsilon

            occupied = [count for count in per_band if count > 0]
            # False positives shouldn't be appearing in multiple bands
            assert len(occupied) <= 2
            # False positives shouldn't have high abundance
            if len(occupied) > 1:
                assert min(occupied) == 1
def test_banding_to_disk(ksize, memory, numbands):
    """
    Test accuracy of banding in terms of the data structure contents.

    Stronger than the functional in-memory test: one counttable is filled in
    the normal fashion, another is filled band by band, both are saved to
    disk, and the two files are required to be byte-for-byte identical.
    """
    infile = utils.get_test_data('banding-reads.fq')
    normal_path = utils.get_temp_filename('normal.ct')
    banded_path = utils.get_temp_filename('banding.ct')

    # Normal mode: a single pass over the whole input.
    sketch = khmer.Counttable(ksize, memory / 4, 4)
    sketch.consume_seqfile(infile)
    sketch.save(normal_path)
    print('FPR', khmer.calc_expected_collisions(sketch))

    # Banding mode: one pass per band, all into the same table.
    sketch = khmer.Counttable(ksize, memory / 4, 4)
    for band in range(numbands):
        sketch.consume_seqfile_banding(infile, numbands, band)
    sketch.save(banded_path)
    print('FPR', khmer.calc_expected_collisions(sketch))

    with open(normal_path, 'rb') as normal_fh, \
            open(banded_path, 'rb') as banded_fh:
        normal_bytes = normal_fh.read()
        banded_bytes = banded_fh.read()
    assert normal_bytes == banded_bytes
def test_consume_with_mask():
    """
    Test bulk loading with a mask.

    The mask is built from `seq-a.fa` and the sequence to be loaded comes
    from `seq-b.fa`. Only the last three 13-mers of the loaded sequence are
    expected to be counted; the 13-mer just before them is in the mask and
    is expected to be skipped.
    """
    mask = khmer.Counttable(13, 1e3, 4)
    mask.consume_seqfile(utils.get_test_data('seq-a.fa'))

    table = khmer.Counttable(13, 1e3, 4)
    seqfile = utils.get_test_data('seq-b.fa')
    nreads, nkmers = table.consume_seqfile_with_mask(seqfile, mask)

    assert nreads == 1
    assert nkmers == 3
    assert table.get('GATTTGAGAAAAA') == 0  # in the mask
    for loaded in ('ATTTGAGAAAAAA', 'TTTGAGAAAAAAG', 'TTGAGAAAAAAGT'):
        assert table.get(loaded) == 1
def minitrio():
    """
    Build the four k-mer tables for the `minitrio` data set.

    Returns a `(kid, mom, dad, ref)` tuple: three counttables loaded from the
    proband, mother and father read files, and a small counttable loaded from
    the reference FASTA. All tables use k=31.
    """
    kid, mom, dad = (khmer.Counttable(31, 1e6, 4) for _ in range(3))
    ref = khmer.SmallCounttable(31, 125000, 4)

    sources = (
        (kid, 'minitrio/trio-proband.fq.gz'),
        (mom, 'minitrio/trio-mother.fq.gz'),
        (dad, 'minitrio/trio-father.fq.gz'),
        (ref, 'minitrio/refr.fa'),
    )
    for table, relpath in sources:
        table.consume_seqfile(data_file(relpath))

    return kid, mom, dad, ref
def test_ikmer_abund_after_recalc():
    """
    Ensure interesting k-mer abundances are correct after recalculation.

    The interesting k-mer is annotated with an abundance of 28, but the
    sequence containing it is consumed into `counts` only 10 times. After
    the read set is validated against `counts`, the annotated abundance is
    expected to have been corrected to 10.
    """
    # Ground truth: the surrounding sequence is counted exactly 10 times.
    counts = khmer.Counttable(17, 1e5, 4)
    context = (
        'TTCGTTCCCGAAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGTTCCGTCCTTCA'
    )
    for _ in range(10):
        counts.consume(context)

    ikmer = KmerOfInterest('CATTGTCCTCGGGACTC', 13, [28, 0, 0])
    read = screed.Record(
        name='read1',
        sequence='AAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGT',
        ikmers=[ikmer],
    )

    readset = ReadSet()
    readset.add(read)
    assert read.ikmers[0].abund[0] == 28

    readset.validate(counts, minabund=8)
    assert readset.valid == (1, 1)
    assert read.ikmers[0].abund[0] == 10
def first_pass(reads, mask, memory, timer):
    """
    Re-count the annotated k-mers of a stream of reads.

    The counttable is allocated lazily, using the k-mer size of the first
    annotation encountered, with `memory / 4` as the size of each of its 4
    tables. Reads without annotations are skipped. If `mask` is provided,
    k-mers with a non-zero count in the mask are not counted.

    Parameters:
        reads: iterable of read records exposing `annotations` and
            `ikmerseq(ikmer)`.
        mask: optional object with a `get(kmer)` method; falsy to disable
            masking.
        memory: total memory budget for the counttable.
        timer: object with `start(name)` and `stop(name)` methods; `stop`
            must return the elapsed time in seconds.

    Returns:
        The populated counttable, or `None` if no read carried any
        annotation (including the case where `reads` is empty).
    """
    kevlar.plog('[kevlar::filter] First pass: re-counting k-mers')
    timer.start('firstpass')
    counts = None
    # Bind the read counter up front: the loop below never assigns it when
    # `reads` is empty, and the summary message needs it regardless.
    n = 0
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::filter] processed {counter} reads',
        interval=1e5,
        breaks=[1e6, 1e7],
    )
    for n, read in enumerate(reads, 1):
        progress_indicator.update()
        if len(read.annotations) == 0:
            continue
        if counts is None:
            # The k-mer size is only known once an annotation has been seen.
            ksize = read.annotations[0].ksize
            counts = khmer.Counttable(ksize, memory / 4, 4)
        for ikmer in read.annotations:
            ikseq = read.ikmerseq(ikmer)
            if mask and mask.get(ikseq) > 0:
                continue
            counts.add(ikseq)
    elapsed = timer.stop('firstpass')
    message = 'First pass complete!'
    message += ' Processed {:d} reads in {:.2f} seconds!'.format(n, elapsed)
    kevlar.plog('[kevlar::filter]', message)
    return counts
def dist(infiles, mask, ksize=31, memory=1e6, threads=1):
    """
    Compute a k-mer abundance distribution for the given input files.

    A counttable is allocated with `memory / 4` per table and 4 tables, then
    handed to `count_first_pass` and `count_second_pass`. The result of the
    second pass is summarised with `calc_mu_sigma` and `compute_dist`.

    Returns a `(mu, sigma, data)` tuple, where `mu` and `sigma` are the two
    values returned by `calc_mu_sigma` and `data` is the value returned by
    `compute_dist`.
    """
    sketch = khmer.Counttable(ksize, memory / 4, 4)
    count_first_pass(infiles, sketch, mask, nthreads=threads)
    abundances = count_second_pass(infiles, sketch, nthreads=threads)
    mean, stdev = calc_mu_sigma(abundances)
    distribution = compute_dist(abundances)
    return mean, stdev, distribution
def test_get_kmer_hashes():
    """Check the hashes reported for the first two 33-mers of a sequence."""
    seq = "ATGGATATGGAGGACAAGTATATGGAGGACAAGTATATGGAGGACAAGTAT"
    table = khmer.Counttable(33, 1e6, 3)

    # Expected hashes of the 33-mers starting at offsets 0 and 1.
    first_hash = 4743239192574154715
    second_hash = 2122462908541313313

    assert table.get_kmer_hashes(seq[:33]) == [first_hash]
    assert table.get_kmer_hashes(seq[:34]) == [first_hash, second_hash]
    assert table.get_kmer_hashes(seq[0:33]) == [first_hash]
    assert table.get_kmer_hashes(seq[1:34]) == [second_hash]
def test_countgraph_vs_table():
    """Both types expose `add`; only the graph exposes `consume_and_tag`."""
    table = khmer.Counttable(4, 21, 3)
    graph = khmer.Countgraph(4, 21, 3)

    # Shared counting interface.
    assert hasattr(table, 'add')
    assert hasattr(graph, 'add')

    # Tagging is available on the graph only.
    assert not hasattr(table, 'consume_and_tag')
    assert hasattr(graph, 'consume_and_tag')
def __init__(self, ksize, abundmem):
    """
    Set up empty bookkeeping containers and the k-mer counttable.

    The counttable uses k-mer size `ksize`, with `abundmem / 4` as the size
    of each of its 4 tables.
    """
    # Storage for the reads and the k-mer counts.
    self._reads = dict()
    self._counts = khmer.Counttable(ksize, abundmem / 4, 4)

    # Integer tallies; missing keys start at 0.
    self._readcounts = defaultdict(int)
    self._ikmercounts = defaultdict(int)
    self._masked = defaultdict(int)
    self._lowabund = defaultdict(int)
    self._valid = defaultdict(int)

    self._novalidkmers_count = 0
def test_consume_with_mask_complement():
    """
    Test bulk loading with `consume_masked=True`.

    The mask holds the 13-mers of a single 15 bp sequence. After loading
    `seq-b.fa` against it, those three 13-mers are expected to have a count
    of 1 each, while the three 13-mers of a sequence that was not put into
    the mask are expected to have a count of 0.
    """
    in_mask = 'TGCTTGAAACAAGTG'
    not_in_mask = 'GAAACAAGTGGATTT'

    mask = khmer.Nodetable(13, 1e3, 4)
    mask.consume(in_mask)

    table = khmer.Counttable(13, 1e3, 4)
    seqfile = utils.get_test_data('seq-b.fa')
    nreads, nkmers = table.consume_seqfile_with_mask(
        seqfile, mask, threshold=1, consume_masked=True
    )

    assert table.get_kmer_counts(in_mask) == [1, 1, 1]
    assert table.get_kmer_counts(not_in_mask) == [0, 0, 0]
def test_consume_banding_with_mask():
    """
    Test bulk loading with a mask *in k-mer banding mode*.

    The mask is built from `seq-a.fa` and the sequence to be loaded comes
    from `seq-b.fa`, restricted to band 1 of 4. Of the last four 13-mers of
    the loaded sequence, the first is in the mask, the next two are expected
    to fall outside the band, and only the final one is expected to be
    counted.
    """
    mask = khmer.Counttable(13, 1e3, 4)
    mask.consume_seqfile(utils.get_test_data('seq-a.fa'))

    table = khmer.Counttable(13, 1e3, 4)
    seqfile = utils.get_test_data('seq-b.fa')
    nreads, nkmers = table.consume_seqfile_banding_with_mask(
        seqfile, 4, 1, mask
    )

    assert nreads == 1
    assert nkmers == 1
    assert table.get('GATTTGAGAAAAA') == 0  # in the mask
    for skipped in ('ATTTGAGAAAAAA', 'TTTGAGAAAAAAG'):
        assert table.get(skipped) == 0  # out of band
    assert table.get('TTGAGAAAAAAGT') == 1
def get_unique_kmers(recordstream, ksize=31):
    """
    Yield each unique k-mer from a stream of sequence records.

    Input is expected to be an iterable containing screed or khmer sequence
    records. Uniqueness is decided on the value of `kevlar.revcommin(kmer)`
    (presumably the lesser of the k-mer and its reverse complement; confirm
    against kevlar), but the k-mer yielded is the one seen in the record.
    """
    # The minimal table is used only to split sequences into k-mers; nothing
    # is ever counted in it.
    tokenizer = khmer.Counttable(ksize, 1, 1)
    seen = set()
    for record in recordstream:
        for kmer in tokenizer.get_kmers(record.sequence):
            key = kevlar.revcommin(kmer)
            if key in seen:
                continue
            seen.add(key)
            yield kmer
def kmerproduce(infile, k, outfile):
    """
    Write a table of k-mer counts per sequence to a tab-separated file.

    Every possible k-mer over the alphabet `ACGT` is looked up in a fresh
    counttable for each sequence in `infile` (sequences are upper-cased
    before counting). The table is written transposed: one row per k-mer,
    one column per sequence name.
    """
    all_kmers = [''.join(letters) for letters in product('ACGT', repeat=k)]

    names = []
    rows = []
    for seq in khmer.ReadParser(infile):
        # A new table per sequence keeps the counts of sequences separate.
        table = khmer.Counttable(k, 1e6, 4)
        table.set_use_bigcount(True)
        table.consume(seq.sequence.upper())

        names.append(seq.name)
        rows.append([table.get(kmer) for kmer in all_kmers])

    frame = pd.DataFrame(rows, columns=all_kmers, index=names)
    transposed = frame.T.reset_index()
    transposed.to_csv(outfile, index=False, sep='\t')
def test_counttable_no_unhash():
    """`reverse_hash` on a counttable is expected to raise `ValueError`."""
    table = khmer.Counttable(4, 21, 3)
    with pytest.raises(ValueError):
        table.reverse_hash(1)
#!/usr/bin/env python
# A demonstration of khmer's primary sequence loading function.
#
# Usage: pass the path of a sequence file as the first command-line argument.
# The script loads it into a counttable and reports the counts of two k-mers.
import khmer
import sys

# Counttable parameters: k-mer size, size of each table, number of tables.
ksize = 21
target_table_size = 5e8
num_tables = 4

counts = khmer.Counttable(ksize, target_table_size, num_tables)

seqfile = sys.argv[1]
nseqs, nkmers = counts.consume_seqfile(seqfile)
print('Loaded', nseqs, 'sequences and', nkmers, 'k-mers from', seqfile)

# Report the abundance of two example 21-mers.
for kmer in ('CAGCGCCGTGTTGTTGCAATT', 'GATTACAGATTACAGATTACA'):
    print('The kmer "' + kmer + '" appears', counts.get(kmer),
          'times in the data')
def load_sample_seqfile(seqfiles, ksize, memory, maxfpr=0.2, mask=None,
                        maskmaxabund=1, numbands=None, band=None,
                        outfile=None, numthreads=1, logfile=sys.stderr):
    """
    Compute k-mer abundances for the specified sequence input.

    Expected input is a list of one or more FASTA/FASTQ files corresponding
    to a single sample. A counttable is created and populated with abundances
    of all k-mers observed in the input. If `mask` is provided, the masked
    variants of the khmer loading functions are used. If `numbands` is
    provided, only the k-mers in band `band` are loaded.

    Parameters:
        seqfiles: list of sequence file paths for one sample.
        ksize: k-mer size.
        memory: total memory budget; each of the 4 tables gets `memory / 4`.
        maxfpr: maximum acceptable estimated false positive rate.
        mask: optional khmer table used to filter k-mers while loading.
        maskmaxabund: accepted for interface compatibility; not used here.
        numbands: number of bands when loading in banding mode.
        band: index of the band to load; reported 1-based in the log.
        outfile: optional path to save the counttable to; `.counttable` is
            appended unless it already ends with `.ct` or `.counttable`.
        numthreads: number of loader threads sharing each file's parser.
        logfile: file-like object receiving progress messages.

    Returns:
        The populated counttable.

    Raises:
        kevlar.sketch.KevlarUnsuitableFPRError: if the estimated false
            positive rate exceeds `maxfpr`.
    """
    message = 'loading from ' + ','.join(seqfiles)
    print('[kevlar::count] ', message, file=logfile)

    sketch = khmer.Counttable(ksize, memory / 4, 4)

    # The loading function and its trailing arguments depend only on the
    # call parameters, so choose them once rather than once per thread.
    # NOTE(review): `maskmaxabund` is never forwarded as a mask threshold;
    # confirm whether that is intended before wiring it in.
    if mask:
        if numbands:
            consume = sketch.consume_seqfile_banding_with_mask
            extra_args = (numbands, band, mask)
        else:
            consume = sketch.consume_seqfile_with_mask
            extra_args = (mask,)
    elif numbands:
        consume = sketch.consume_seqfile_banding
        extra_args = (numbands, band)
    else:
        consume = sketch.consume_seqfile
        extra_args = ()

    numreads = 0
    for seqfile in seqfiles:
        # All threads of one file pull reads from the same parser.
        parser = khmer.ReadParser(seqfile)
        threads = [
            threading.Thread(target=consume, args=(parser,) + extra_args)
            for _ in range(numthreads)
        ]
        for thread in threads:
            thread.start()
        for thread in threads:
            thread.join()
        # Accumulate per file so the report covers the whole sample rather
        # than only the last file (and stays defined for an empty list).
        numreads += parser.num_reads

    message = 'done loading reads'
    if numbands:
        message += ' (band {:d}/{:d})'.format(band + 1, numbands)
    fpr = kevlar.sketch.estimate_fpr(sketch)
    message += ';\n    {:d} reads processed'.format(numreads)
    message += ', {:d} distinct k-mers stored'.format(sketch.n_unique_kmers())
    message += ';\n    estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        message += ' (FPR too high, bailing out!!!)'
        message = '[kevlar::count]     ' + message
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)

    if outfile:
        if not outfile.endswith(('.ct', '.counttable')):
            outfile += '.counttable'
        sketch.save(outfile)
        message += ';\n    saved to "{:s}"'.format(outfile)
    print('[kevlar::count] ', message, file=logfile)

    return sketch
def test_kmer_revcom_hash(kmer):
    """A k-mer and its reverse complement are expected to hash identically."""
    table = khmer.Counttable(21, 1e4, 3)
    forward_hash = table.hash(kmer)
    revcom_hash = table.hash(khmer.reverse_complement(kmer))
    assert forward_hash == revcom_hash