def buildIndex(K, inputs, output):
    """
    Create a new k-mer index.

    The FASTA files named in the list `inputs` are read in and
    the `K` length k-mers and their reverse complements are
    extracted and collated to create an index that maps from
    k-mer to sequence number (numbering from 0).

    The `names` member of the KmerIndex object can be used to
    retrieve the name from the sequence number.

    Parameters:
        K       -- k-mer length passed to `kmers` and `writeKmers`.
        inputs  -- list of FASTA file names, read in order with `openFile`.
        output  -- name handed to `container(output, 'w')`.

    Raises:
        OverflowError -- if a sequence that contributes at least one k-mer
            has a sequence number that does not fit in the 16-bit postings
            array.  The check is made while k-mers are being extracted,
            before any of the sorting/ranking work is done.
    """
    seqs = []
    for inp in inputs:
        with openFile(inp) as f:
            seqs.extend(readFasta(f))

    # Postings are stored in an 'H' (unsigned short) array, so this is the
    # largest sequence number that can be recorded.
    maxSeqNum = (1 << (8 * array.array('H').itemsize)) - 1

    S = []
    nms = []
    lens = array.array('I')
    perSeq = []
    for i, (nm, seq) in enumerate(seqs):
        nms.append(nm)
        xs = list(kmers(K, seq, True))
        xs.sort()
        uniq(xs)
        if xs and i > maxSeqNum:
            # Storing `i` in the postings array below would fail anyway;
            # fail now with a message that explains why.
            raise OverflowError(
                "sequence number %d (%r) exceeds the maximum of %d that "
                "the 16-bit postings array can hold" % (i, nm, maxSeqNum))
        # The raw sequence is no longer needed once its k-mers are known.
        seqs[i] = None
        perSeq.append(xs)
        lens.append(len(xs))
        S.extend(xs)

    # Collate the distinct k-mers across all sequences.
    S.sort()
    uniq(S)
    S = sparse(2*K, S)

    # First pass: count how many sequences contain each k-mer, indexed by
    # the k-mer's rank in S.
    T = array.array('I', [0]) * (S.count() + 1)
    for xs in perSeq:
        for x in xs:
            T[S.rank(x)] += 1

    # Turn the counts into starting offsets (exclusive prefix sum).
    # NOTE(review): presumably rank() is 0-based, so the extra final slot
    # ends up holding the total number of postings -- confirm.
    total = 0
    for r, c in enumerate(T):
        T[r] = total
        total += c

    # Second pass: drop each sequence number into the next free slot of
    # its k-mer's posting range.  T0 is a scratch copy of the offsets that
    # is advanced as slots are filled, leaving T itself untouched.
    T0 = list(T)
    U = array.array('H', [0]) * total
    for i, xs in enumerate(perSeq):
        for x in xs:
            r = S.rank(x)
            U[T0[r]] = i
            T0[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        z.meta['T'] = write32(z, T, 'offsets')
        z.meta['U'] = write16(z, U, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = nms
def test_uniq_few():
    """uniq collapses runs of repeated values in a short sorted list."""
    values = [1, 1, 2, 3, 3, 3, 4]
    misc.uniq(values)
    assert values == [1, 2, 3, 4]
def test_uniq_simple():
    """uniq leaves a list that is already duplicate-free unchanged."""
    xs = list(range(10))
    print(xs)
    misc.uniq(xs)
    print(xs)
    assert xs == list(range(10))
def test_uniq_many():
    """uniq on a large sorted list with many repeats matches sorted(set(...))."""
    random.seed(17)
    N = 100000
    xs = [random.randint(0, 1000) for _ in range(N)]

    # Reference answer: the distinct values in ascending order.
    expected = sorted(set(xs))

    xs.sort()
    misc.uniq(xs)
    assert xs == expected
def neigh(K, x, d):
    """
    Enumerate substitution neighbours of the k-mer `x`.

    `x` is treated as `K` fields of 2 bits each.

    d == 0: returns an empty list.
    d == 1: returns the sorted list of the 3*K values obtained by
            XORing one 2-bit field of `x` with 1, 2 or 3.
    d >  1: gathers neigh(K, y, d - 1) for every d == 1 neighbour y,
            sorts and de-duplicates the result, then removes `x` and
            the d == 1 neighbours from it.

    `uniq` and `diff` are called for their effect on the list passed in;
    their return values are ignored.
    """
    if d == 0:
        return []

    # All single-field changes: each of the K positions, each of the three
    # non-zero 2-bit deltas.
    singles = sorted(x ^ (delta << (2 * pos))
                     for pos in xrange(K)
                     for delta in (1, 2, 3))
    if d == 1:
        return singles

    merged = []
    for y in singles:
        merged.extend(neigh(K, y, d - 1))
    merged.sort()
    uniq(merged)
    diff(merged, [x])
    diff(merged, singles)
    return merged
# make-kset.py: collect the K-length k-mers (extracted with the `True`
# flag passed to `kmers`) from one or more FASTA files and write them out
# as a k-mer set.
from pykmer.basics import kmers
from pykmer.file import readFasta
from pykmer.misc import uniq
import pykmer.kset as kset

import sys

# K, the output filename and at least one input file are required.
if len(sys.argv) < 4:
    print >> sys.stderr, "usage: make-kset.py <K> <output-filename> <input-FASTA>...."
    sys.exit(1)

K = int(sys.argv[1])
outputName = sys.argv[2]
fastaNames = sys.argv[3:]

# Accumulate the k-mers of every sequence in every input file.
xs = []
for fn in fastaNames:
    with open(fn) as f:
        for (nm, seq) in readFasta(f):
            xs.extend(kmers(K, seq, True))

# Sort, then drop duplicates, before writing the set.
xs.sort()
uniq(xs)

kset.write(K, xs, outputName)
def test_uniq_empty():
    """uniq on an empty list leaves it empty."""
    values = []
    misc.uniq(values)
    assert values == []
def test_uniq_singleton():
    """uniq on a one-element list keeps that element."""
    values = [1]
    misc.uniq(values)
    assert values == [1]