Esempio n. 1
0
def buildIndex(K, inputs, output):
    """
    Build a new k-mer index and store it in the container `output`.

    Each FASTA file named in the list `inputs` is read, and for every
    sequence the `K` length k-mers and their reverse complements are
    extracted. The collated k-mers form an index that maps from k-mer
    to sequence number (sequences are numbered from 0, in the order
    they are read). The `names` member of the KmerIndex object can be
    used to retrieve the name from the sequence number.
    """
    # Gather every (name, sequence) record from all the input files.
    seqs = []
    for fn in inputs:
        with openFile(fn) as fh:
            seqs.extend(readFasta(fh))

    nms = []
    lens = array.array('I')
    allKmers = []
    for (j, (nm, seq)) in enumerate(seqs):
        nms.append(nm)
        ks = sorted(kmers(K, seq, True))
        uniq(ks)
        # From here on only the distinct k-mers of the sequence are
        # needed, so replace the record in place.
        seqs[j] = [nm, ks]
        lens.append(len(ks))
        allKmers.extend(ks)
    allKmers.sort()
    uniq(allKmers)
    S = sparse(2*K, allKmers)

    # First pass: T[r] counts the sequences containing the k-mer that
    # S.rank() maps to r (presumably its position in the sorted k-mer
    # set -- confirm against the `sparse` implementation).
    T = array.array('I', [0]) * (S.count() + 1)
    for (nm, ks) in seqs:
        for x in ks:
            T[S.rank(x)] += 1

    # Turn the counts into starting offsets (exclusive running sum);
    # `total` ends up as the overall number of postings.
    total = 0
    for (r, c) in enumerate(T):
        T[r] = total
        total += c

    # Second pass: drop each sequence number into the next free slot
    # of its k-mer's range. `nxt` is a scratch copy so T keeps the
    # range starts. Typecode 'H' is an unsigned short, so sequence
    # numbers must fit in that type.
    nxt = list(T)
    U = array.array('H', [0]) * total
    for (j, (nm, ks)) in enumerate(seqs):
        for x in ks:
            r = S.rank(x)
            U[nxt[r]] = j
            nxt[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        z.meta['T'] = write32(z, T, 'offsets')
        z.meta['U'] = write16(z, U, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = nms
Esempio n. 2
0
def test_uniq_few():
    """
    A short sorted list with repeated values is reduced, in place,
    to one copy of each value.
    """
    vals = [1, 1, 2, 3, 3, 3, 4]
    misc.uniq(vals)
    assert len(vals) == 4
    assert vals == [1, 2, 3, 4]
Esempio n. 3
0
def test_uniq_simple():
    """
    A list that has no repeated values is left unchanged by uniq.
    """
    xs = list(range(10))
    print(xs)
    misc.uniq(xs)
    print(xs)
    assert len(xs) == 10
    assert xs == list(range(10))
Esempio n. 4
0
def test_uniq_many():
    """
    Compare uniq on a large sorted list of random values (fixed seed)
    against the sorted contents of a set built from the same values.
    """
    random.seed(17)
    N = 100000
    vals = [random.randint(0, 1000) for _ in range(N)]
    expected = sorted(set(vals))
    vals.sort()
    misc.uniq(vals)
    assert len(vals) == len(expected)
    assert vals == expected
Esempio n. 5
0
def neigh(K, x, d):
    """
    Return a sorted list of neighbours of the integer `x`.

    `x` is treated as `K` two-bit fields. A single step XORs one of
    the fields with 1, 2 or 3, i.e. changes exactly one field to one
    of its three other values.

    d == 0 gives the empty list and d == 1 gives the 3*K single-step
    neighbours. For larger d, the neighbours at d - 1 of every
    single-step neighbour are pooled, sorted and de-duplicated, and
    then `x` itself and the single-step neighbours are taken out
    with `diff`.

    NOTE(review): only `x` and the single-step neighbours are taken
    out, so for d >= 3 the result may still contain values reachable
    in fewer than d steps -- confirm whether that is intended.
    """
    if d == 0:
        return []

    # Every value obtained by altering exactly one two-bit field.
    near = sorted(x ^ (delta << (2 * pos))
                  for pos in range(K)
                  for delta in (1, 2, 3))
    if d == 1:
        return near

    far = []
    for y in near:
        far.extend(neigh(K, y, d - 1))
    far.sort()
    uniq(far)
    diff(far, [x])
    diff(far, near)
    return far
Esempio n. 6
0
from pykmer.basics import kmers
from pykmer.file import readFasta
from pykmer.misc import uniq
import pykmer.kset as kset

import sys

# Expect: <K> <output-filename> followed by at least one FASTA file.
if len(sys.argv) < 4:
    sys.stderr.write("usage: make-kset.py <K> <output-filename> <input-FASTA>...." + "\n")
    sys.exit(1)

K = int(sys.argv[1])

# Pool the k-mers of every sequence in every input file.
xs = []
for fastaName in sys.argv[3:]:
    with open(fastaName) as fasta:
        for (name, sequence) in readFasta(fasta):
            xs.extend(kmers(K, sequence, True))

# Sort and de-duplicate before writing the set to the output file.
xs.sort()
uniq(xs)
kset.write(K, xs, sys.argv[2])
Esempio n. 7
0
def test_uniq_empty():
    """
    uniq applied to an empty list leaves it empty.
    """
    vals = []
    misc.uniq(vals)
    assert vals == []
Esempio n. 8
0
def test_uniq_singleton():
    """
    uniq applied to a one-element list keeps that single element.
    """
    vals = [1]
    misc.uniq(vals)
    assert len(vals) == 1
    assert vals == [1]