Esempio n. 1
0
def test_consume_with_mask_threshold():
    """
    Check masked bulk loading when an abundance threshold is supplied.

    The mask is built by consuming the shorter sequence three times and the
    longer sequence twice. The sequence file is then loaded against that mask
    with a threshold of 3, and only the last three k-mers of the loaded
    sequence are expected to end up in the counttable.

    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA        |
    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA        |
    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA        | <--- mask input
    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT     |
    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT     |

       ATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT     <--- sequence
                          |-----------|
                           |-----------|      <--- only these k-mers are
                            |-----------|          abundance <= 3 in the mask
    """
    # (sequence, number of times it is added to the mask)
    mask_inputs = (
        ('TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA', 3),
        ('TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT', 2),
    )
    mask = khmer.Counttable(13, 1e3, 4)
    for sequence, copies in mask_inputs:
        for _ in range(copies):
            mask.consume(sequence)

    seqfile = utils.get_test_data('seq-b.fa')
    table = khmer.Counttable(13, 1e3, 4)
    num_reads, num_kmers = table.consume_seqfile_with_mask(seqfile, mask, 3)

    assert num_reads == 1
    assert num_kmers == 3

    expected_counts = {
        'GATTTGAGAAAAA': 0,  # in the mask
        'ATTTGAGAAAAAA': 1,
        'TTTGAGAAAAAAG': 1,
        'TTGAGAAAAAAGT': 1,
    }
    for kmer, abundance in expected_counts.items():
        assert table.get(kmer) == abundance
Esempio n. 2
0
def test_banding_in_memory(ksize, memory, epsilon, numbands):
    """
    Functional accuracy check for k-mer banding.

    One counttable is loaded from the whole input in the usual way, and one
    smaller counttable is loaded per band. For a sample of the reads, the
    per-band abundances of every k-mer are compared against the abundance
    reported by the normally loaded table.
    """
    seqfile = utils.get_test_data('banding-reads.fq')

    reference = khmer.Counttable(ksize, memory / 4, 4)
    reference.consume_seqfile(seqfile)

    # One table per band, each given an equal share of the memory budget.
    band_tables = []
    for band in range(numbands):
        table = khmer.Counttable(ksize, memory / 4 / numbands, 4)
        table.consume_seqfile_banding(seqfile, numbands, band)
        band_tables.append(table)

    for index, record in enumerate(screed.open(seqfile)):
        # Spot check only: examine every 100th record, skipping record 0.
        if index <= 0 or index % 100 != 0:
            continue
        for kmer in reference.get_kmers(record.sequence):
            expected = reference.get(kmer)
            per_band = [table.get(kmer) for table in band_tables]
            # Exact equality would be ideal, but false positives force us to
            # tolerate a small discrepancy.
            assert abs(sum(per_band) - expected) <= epsilon

            hits = [count for count in per_band if count > 0]
            # A k-mer should not show up in more than two bands.
            assert len(hits) <= 2
            # When it shows up in several, at least one must be a singleton
            # (i.e. a low-abundance false positive).
            if len(hits) > 1:
                assert min(hits) == 1
Esempio n. 3
0
def test_banding_to_disk(ksize, memory, numbands):
    """
    Check banding accuracy by comparing serialized data structures.

    This is stricter than the functional in-memory test: a table loaded in
    the normal fashion and a table loaded band by band are both saved to
    disk, and the two files must be byte-for-byte identical.
    """
    seqfile = utils.get_test_data('banding-reads.fq')
    normal_path = utils.get_temp_filename('normal.ct')
    banded_path = utils.get_temp_filename('banding.ct')

    # Reference: load everything in a single pass.
    normal = khmer.Counttable(ksize, memory / 4, 4)
    normal.consume_seqfile(seqfile)
    normal.save(normal_path)
    print('FPR', khmer.calc_expected_collisions(normal))

    # Same-sized table, filled one band at a time.
    banded = khmer.Counttable(ksize, memory / 4, 4)
    for band in range(numbands):
        banded.consume_seqfile_banding(seqfile, numbands, band)
    banded.save(banded_path)
    print('FPR', khmer.calc_expected_collisions(banded))

    with open(normal_path, 'rb') as normal_fh:
        normal_bytes = normal_fh.read()
    with open(banded_path, 'rb') as banded_fh:
        banded_bytes = banded_fh.read()
    assert normal_bytes == banded_bytes
Esempio n. 4
0
def test_consume_with_mask():
    """
    Check bulk loading of a sequence file against a mask.

    The mask is loaded from one file and the sequence from another, as
    illustrated below. Only the last three k-mers of the sequence are absent
    from the mask, so they are the only ones expected in the counttable.

    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA        <--- mask
       ATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT     <--- sequence
                          |-----------|
                           |-----------|
                            |-----------|
    """
    mask = khmer.Counttable(13, 1e3, 4)
    mask.consume_seqfile(utils.get_test_data('seq-a.fa'))

    table = khmer.Counttable(13, 1e3, 4)
    seqfile = utils.get_test_data('seq-b.fa')
    num_reads, num_kmers = table.consume_seqfile_with_mask(seqfile, mask)

    assert num_reads == 1
    assert num_kmers == 3

    expected_counts = {
        'GATTTGAGAAAAA': 0,  # in the mask
        'ATTTGAGAAAAAA': 1,
        'TTTGAGAAAAAAG': 1,
        'TTGAGAAAAAAGT': 1,
    }
    for kmer, abundance in expected_counts.items():
        assert table.get(kmer) == abundance
Esempio n. 5
0
def minitrio():
    """
    Load the "minitrio" data set into k-mer count tables.

    Returns a `(kid, mom, dad, ref)` tuple: three counttables filled from the
    proband, mother and father read files, plus a small counttable filled
    from the reference sequence.
    """
    kid, mom, dad = (khmer.Counttable(31, 1e6, 4) for _ in range(3))
    ref = khmer.SmallCounttable(31, 125000, 4)

    sources = (
        (kid, 'minitrio/trio-proband.fq.gz'),
        (mom, 'minitrio/trio-mother.fq.gz'),
        (dad, 'minitrio/trio-father.fq.gz'),
        (ref, 'minitrio/refr.fa'),
    )
    for table, relpath in sources:
        table.consume_seqfile(data_file(relpath))

    return kid, mom, dad, ref
Esempio n. 6
0
def test_ikmer_abund_after_recalc():
    """
    Verify that validation corrects a stale interesting k-mer abundance.

    The read advertises an abundance of 28 for its interesting k-mer, while
    the `counts` table is built by consuming the containing sequence only 10
    times. After `ReadSet.validate` the recorded abundance must be 10.
    """
    interesting = KmerOfInterest('CATTGTCCTCGGGACTC', 13, [28, 0, 0])
    read = screed.Record(
        name='read1',
        sequence='AAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGT',
        ikmers=[interesting],
    )

    seq = 'TTCGTTCCCGAAGCAGGGGTCTACATTGTCCTCGGGACTCGAGATTTCTTCGCTGTTCCGTCCTTCA'
    counts = khmer.Counttable(17, 1e5, 4)
    for _ in range(10):
        counts.consume(seq)

    readset = ReadSet()
    readset.add(read)
    # Adding the read must not touch the advertised abundance.
    assert read.ikmers[0].abund[0] == 28

    readset.validate(counts, minabund=8)
    assert readset.valid == (1, 1)
    assert read.ikmers[0].abund[0] == 10
Esempio n. 7
0
def first_pass(reads, mask, memory, timer):
    """
    Re-count the annotated (interesting) k-mers of every read.

    The counttable is allocated lazily, using the k-mer size of the first
    annotation encountered, and is sized as four tables of `memory / 4`
    each. K-mers with a non-zero count in `mask` (when a mask is given) are
    skipped.

    :param reads: iterable of read records exposing `annotations` and
        `ikmerseq(ikmer)`
    :param mask: optional table supporting `get(kmer)`; falsy to disable
    :param memory: total memory budget for the counttable
    :param timer: timer object providing `start(key)` and `stop(key)`
    :returns: the populated counttable, or `None` if no read carried any
        annotations (including when `reads` is empty)
    """
    kevlar.plog('[kevlar::filter] First pass: re-counting k-mers')
    timer.start('firstpass')
    counts = None
    progress_indicator = kevlar.ProgressIndicator(
        '[kevlar::filter]     processed {counter} reads',
        interval=1e5,
        breaks=[1e6, 1e7],
    )
    # Initialise the read counter so the summary message below still works
    # when `reads` is empty (the loop variable would otherwise be unbound).
    n = 0
    for n, read in enumerate(reads, 1):
        progress_indicator.update()
        if len(read.annotations) == 0:
            continue
        if counts is None:
            # All annotations are assumed to share the k-mer size of the
            # first one seen -- NOTE(review): confirm against callers.
            ksize = read.annotations[0].ksize
            counts = khmer.Counttable(ksize, memory / 4, 4)
        for ikmer in read.annotations:
            ikseq = read.ikmerseq(ikmer)
            if mask and mask.get(ikseq) > 0:
                continue
            counts.add(ikseq)
    elapsed = timer.stop('firstpass')
    message = 'First pass complete!'
    message += ' Processed {:d} reads in {:.2f} seconds!'.format(n, elapsed)
    kevlar.plog('[kevlar::filter]', message)
    return counts
Esempio n. 8
0
def dist(infiles, mask, ksize=31, memory=1e6, threads=1):
    """
    Run the two counting passes over `infiles` and summarize the abundances.

    A counttable (four tables of `memory / 4` each) is filled by
    `count_first_pass`, the abundances are collected by `count_second_pass`,
    and the result is summarized with `calc_mu_sigma` and `compute_dist`.

    Returns a `(mu, sigma, data)` tuple.
    """
    table = khmer.Counttable(ksize, memory / 4, 4)
    count_first_pass(infiles, table, mask, nthreads=threads)
    abundances = count_second_pass(infiles, table, nthreads=threads)

    mu, sigma = calc_mu_sigma(abundances)
    distribution = compute_dist(abundances)
    return mu, sigma, distribution
Esempio n. 9
0
def test_get_kmer_hashes():
    """Check the hash values reported for the first two 33-mers of a read."""
    seq = "ATGGATATGGAGGACAAGTATATGGAGGACAAGTATATGGAGGACAAGTAT"
    first_hash = 4743239192574154715
    second_hash = 2122462908541313313
    table = khmer.Counttable(33, 1e6, 3)

    # A 33-base prefix holds one k-mer; a 34-base prefix holds two.
    assert table.get_kmer_hashes(seq[:33]) == [first_hash]
    assert table.get_kmer_hashes(seq[:34]) == [first_hash, second_hash]

    # Each k-mer hashed on its own gives the same value as above.
    assert table.get_kmer_hashes(seq[0:33]) == [first_hash]
    assert table.get_kmer_hashes(seq[1:34]) == [second_hash]
Esempio n. 10
0
def test_countgraph_vs_table():
    """Both types expose `add`; only the graph exposes `consume_and_tag`."""
    table = khmer.Counttable(4, 21, 3)
    graph = khmer.Countgraph(4, 21, 3)

    for sketch in (table, graph):
        assert hasattr(sketch, 'add')

    assert not hasattr(table, 'consume_and_tag')
    assert hasattr(graph, 'consume_and_tag')
Esempio n. 11
0
    def __init__(self, ksize, abundmem):
        """
        Set up empty bookkeeping and a counttable for k-mer abundances.

        The counttable is built from four tables of `abundmem / 4` each.
        """
        # Read storage and the k-mer abundance sketch.
        self._reads = {}
        self._counts = khmer.Counttable(ksize, abundmem / 4, 4)

        # Integer tallies that default to zero for unseen keys.
        self._readcounts = defaultdict(int)
        self._ikmercounts = defaultdict(int)
        self._masked = defaultdict(int)
        self._lowabund = defaultdict(int)
        self._valid = defaultdict(int)

        self._novalidkmers_count = 0
Esempio n. 12
0
def test_consume_with_mask_complement():
    """
    Check masked loading with `consume_masked=True`.

    With this option the k-mers of the sequence that was added to the mask
    are counted, while k-mers from elsewhere in the input are not.
    """
    mask = khmer.Nodetable(13, 1e3, 4)
    mask.consume('TGCTTGAAACAAGTG')

    seqfile = utils.get_test_data('seq-b.fa')
    table = khmer.Counttable(13, 1e3, 4)
    options = {'threshold': 1, 'consume_masked': True}
    _num_reads, _num_kmers = table.consume_seqfile_with_mask(
        seqfile, mask, **options
    )

    in_mask = table.get_kmer_counts('TGCTTGAAACAAGTG')
    not_in_mask = table.get_kmer_counts('GAAACAAGTGGATTT')
    assert in_mask == [1, 1, 1]
    assert not_in_mask == [0, 0, 0]
Esempio n. 13
0
def test_consume_banding_with_mask():
    """
    Check bulk loading against a mask *in k-mer banding mode*.

    Of the three k-mers absent from the mask, only the last one is expected
    to be loaded when band 1 of 4 is requested.

    TAGATCTGCTTGAAACAAGTGGATTTGAGAAAAA        <--- mask
       ATCTGCTTGAAACAAGTGGATTTGAGAAAAAAGT     <--- sequence
                          |-----------|
                           |-----------|
                            |-----------|     <--- only k-mer in band 1/4
    """
    mask = khmer.Counttable(13, 1e3, 4)
    mask.consume_seqfile(utils.get_test_data('seq-a.fa'))

    table = khmer.Counttable(13, 1e3, 4)
    seqfile = utils.get_test_data('seq-b.fa')
    num_reads, num_kmers = table.consume_seqfile_banding_with_mask(
        seqfile, 4, 1, mask
    )

    assert num_reads == 1
    assert num_kmers == 1

    expected_counts = {
        'GATTTGAGAAAAA': 0,  # in the mask
        'ATTTGAGAAAAAA': 0,  # out of band
        'TTTGAGAAAAAAG': 0,  # out of band
        'TTGAGAAAAAAGT': 1,
    }
    for kmer, abundance in expected_counts.items():
        assert table.get(kmer) == abundance
Esempio n. 14
0
def get_unique_kmers(recordstream, ksize=31):
    """
    Yield each distinct k-mer found in a stream of sequence records.

    `recordstream` is an iterable of screed or khmer sequence records. Two
    k-mers are treated as the same when `kevlar.revcommin` maps them to the
    same value (presumably the canonical strand -- confirm against kevlar).
    The k-mer yielded is the one encountered first, in the orientation it was
    read, not the value returned by `kevlar.revcommin`.
    """
    # A minimal table is enough: it is only used to split reads into k-mers.
    splitter = khmer.Counttable(ksize, 1, 1)
    seen = set()
    for record in recordstream:
        for kmer in splitter.get_kmers(record.sequence):
            canonical = kevlar.revcommin(kmer)
            if canonical in seen:
                continue
            seen.add(canonical)
            yield kmer
Esempio n. 15
0
def kmerproduce(infile, k, outfile):
    """
    Write a tab-separated table of k-mer counts per input sequence.

    Every possible k-mer over the alphabet ACGT is counted in each sequence
    of `infile` (upper-cased first), using a fresh counttable per sequence.
    The table written to `outfile` is transposed: one row per k-mer, one
    column per sequence name.
    """
    kmerlist = [''.join(letters) for letters in product('ACGT', repeat=k)]

    rows = []
    seqids = []
    for seq in khmer.ReadParser(infile):
        seqids.append(seq.name)
        counts = khmer.Counttable(k, 1e6, 4)
        counts.set_use_bigcount(True)
        counts.consume(seq.sequence.upper())
        rows.append([counts.get(kmer) for kmer in kmerlist])

    table = pd.DataFrame(rows, columns=kmerlist, index=seqids)
    table.T.reset_index().to_csv(outfile, index=False, sep='\t')
Esempio n. 16
0
def test_counttable_no_unhash():
    """Calling `reverse_hash` on a Counttable must raise ValueError."""
    table = khmer.Counttable(4, 21, 3)

    with pytest.raises(ValueError):
        table.reverse_hash(1)
Esempio n. 17
0
#!/usr/bin/env python

# A demonstration of khmer's primary sequence loading function.

import khmer
import sys

# Sketch dimensions: k-mer size, size of each table, and number of tables.
ksize = 21
target_table_size = 5e8
num_tables = 4

counts = khmer.Counttable(ksize, target_table_size, num_tables)

# The sequence file to load is the first command-line argument.
seqfile = sys.argv[1]
nseqs, nkmers = counts.consume_seqfile(seqfile)
print('Loaded', nseqs, 'sequences and', nkmers, 'k-mers from', seqfile)

# Report the abundance of a couple of example k-mers.
for kmer in ('CAGCGCCGTGTTGTTGCAATT', 'GATTACAGATTACAGATTACA'):
    print('The kmer "{}" appears'.format(kmer),
          counts.get(kmer), 'times in the data')
Esempio n. 18
0
def load_sample_seqfile(seqfiles,
                        ksize,
                        memory,
                        maxfpr=0.2,
                        mask=None,
                        maskmaxabund=1,
                        numbands=None,
                        band=None,
                        outfile=None,
                        numthreads=1,
                        logfile=sys.stderr):
    """
    Compute k-mer abundances for the specified sequence input.

    Expected input is a list of one or more FASTA/FASTQ files corresponding
    to a single sample. A counttable is created and populated with abundances
    of all k-mers observed in the input. If `mask` is provided, only k-mers not
    present in the mask will be loaded. If `numbands` is provided, the
    banding variants of the loading functions are used with band `band`.

    Each input file is consumed by `numthreads` worker threads sharing one
    parser. All workers for a file are joined before the next file is
    opened, so the table is fully populated before the false positive rate is
    estimated, and the reported read count covers every input file.

    NOTE(review): `maskmaxabund` is accepted but is not forwarded to the
    loading functions; confirm whether it should be passed as the mask
    threshold.

    :returns: the populated counttable
    :raises kevlar.sketch.KevlarUnsuitableFPRError: if the estimated false
        positive rate exceeds `maxfpr`
    """
    message = 'loading from ' + ','.join(seqfiles)
    print('[kevlar::count]    ', message, file=logfile)

    sketch = khmer.Counttable(ksize, memory / 4, 4)

    # The loading function and its trailing arguments depend only on the
    # mask/banding configuration, so pick them once up front.
    if mask:
        if numbands:
            consume = sketch.consume_seqfile_banding_with_mask
            extra_args = (numbands, band, mask)
        else:
            consume = sketch.consume_seqfile_with_mask
            extra_args = (mask, )
    else:
        if numbands:
            consume = sketch.consume_seqfile_banding
            extra_args = (numbands, band)
        else:
            consume = sketch.consume_seqfile
            extra_args = ()

    numreads = 0
    for seqfile in seqfiles:
        parser = khmer.ReadParser(seqfile)
        threads = list()
        for _ in range(numthreads):
            thread = threading.Thread(
                target=consume,
                args=(parser, ) + extra_args,
            )
            threads.append(thread)
            thread.start()

        # Wait for this file's workers before moving on; otherwise only the
        # last file's threads would ever be joined.
        for thread in threads:
            thread.join()
        numreads += parser.num_reads

    message = 'done loading reads'
    if numbands:
        message += ' (band {:d}/{:d})'.format(band + 1, numbands)
    fpr = kevlar.sketch.estimate_fpr(sketch)
    message += ';\n    {:d} reads processed'.format(numreads)
    message += ', {:d} distinct k-mers stored'.format(sketch.n_unique_kmers())
    message += ';\n    estimated false positive rate is {:1.3f}'.format(fpr)
    if fpr > maxfpr:
        message += ' (FPR too high, bailing out!!!)'
        message = '[kevlar::count]     ' + message
        raise kevlar.sketch.KevlarUnsuitableFPRError(message)

    if outfile:
        # Ensure the saved file carries a recognizable counttable extension.
        if not outfile.endswith(('.ct', '.counttable')):
            outfile += '.counttable'
        sketch.save(outfile)
        message += ';\n    saved to "{:s}"'.format(outfile)
    print('[kevlar::count]    ', message, file=logfile)

    return sketch
Esempio n. 19
0
def test_kmer_revcom_hash(kmer):
    """A k-mer and its reverse complement must hash to the same value."""
    table = khmer.Counttable(21, 1e4, 3)
    forward_hash = table.hash(kmer)
    reverse_hash = table.hash(khmer.reverse_complement(kmer))
    assert forward_hash == reverse_hash