def buildIndex(K, inputs, output):
    """
    Build a k-mer index from FASTA files and write it to `output`.

    Every file named in the list `inputs` is read, and for each sequence
    the `K` length k-mers together with their reverse complements are
    extracted.  The resulting index maps each distinct k-mer to the
    numbers of the sequences containing it (sequences are numbered
    from 0 in the order they were read).  The sequence names are stored
    alongside, so the `names` member of the KmerIndex object can be used
    to turn a sequence number back in to a name.

    Nothing is returned; the index is written to the container `output`.
    """
    # Pull every <name, sequence> record out of the input files.
    records = []
    for path in inputs:
        with openFile(path) as handle:
            records.extend(readFasta(handle))

    # Reduce each sequence to its sorted k-mer list, and pool all the
    # k-mers so the global k-mer set can be formed.
    # NOTE(review): uniq() is assumed to drop duplicates from a sorted
    # list in place - its result is not used here.
    names = []
    lens = array.array('I')
    pooled = []
    for (seqNum, (name, seq)) in enumerate(records):
        ks = sorted(kmers(K, seq, True))
        uniq(ks)
        names.append(name)
        lens.append(len(ks))
        pooled.extend(ks)
        # Overwrite the record with its k-mer list; the raw sequence
        # is not needed again.
        records[seqNum] = ks
    perSeq = records

    pooled.sort()
    uniq(pooled)
    kmerSet = sparse(2*K, pooled)

    # Pass 1: for each k-mer rank, count the sequences that contain it.
    # There is one extra slot beyond the last rank.
    offsets = array.array('I', [0]) * (kmerSet.count() + 1)
    for ks in perSeq:
        for x in ks:
            offsets[kmerSet.rank(x)] += 1

    # Turn the counts in to starting offsets (an exclusive running sum).
    # Afterwards `total` is the overall number of postings.
    total = 0
    for (j, c) in enumerate(offsets):
        offsets[j] = total
        total += c

    # Pass 2: drop each sequence number in to the next free slot of
    # its k-mer's run of postings.  `cursor` is a scratch copy of the
    # offsets that is advanced as slots are filled.
    cursor = list(offsets)
    postings = array.array('H', [0]) * total
    for (seqNum, ks) in enumerate(perSeq):
        for x in ks:
            r = kmerSet.rank(x)
            postings[cursor[r]] = seqNum
            cursor[r] += 1

    # Write everything out, noting in the metadata the value returned
    # by each of the array writers.
    with container(output, 'w') as z:
        writeKmers(K, kmerSet.xs, z)
        z.meta['T'] = write32(z, offsets, 'offsets')
        z.meta['U'] = write16(z, postings, 'postings')
        z.meta['lens'] = write32(z, lens, 'lens')
        z.meta['names'] = names
def test_kmers():
    """
    Check that basics.kmers() (forward strand only) yields the same
    values, in the same order, as calling basics.kmer() on every
    window of the sequence and discarding the windows for which it
    returns None.
    """
    seq = "CCTCGTACGCCATATTTTCGCATTTCACGTACGTATTGTTTTTGCAACATAATTACCTATTCTCTTTTGGGGGGGGTTTTAGGCATTCCATTTAATNGCTTTTCTTTTAATGCATGGAGTTTTTCCCATTCATCCTTTGATATATTATCTTTACTTGCTTCGAAGTCTNTTGCTGTGAGATGTATATCTTCTGGATGGATTTGTTTACGTTCTTTTGTTACTGGATCTATAGTAAATGGAATCATTTCCTT"
    width = 25

    actual = list(basics.kmers(width, seq, False))

    # Convert each window exactly once, then keep the ones that
    # produced a value.
    lastStart = len(seq) - width
    converted = (basics.kmer(seq[j:j + width]) for j in range(lastStart + 1))
    expected = [y for y in converted if y is not None]

    assert actual == expected
from pykmer.basics import kmers
from pykmer.file import readFasta
from pykmer.misc import uniq
import pykmer.kset as kset
import sys

# Build a k-mer set from one or more FASTA files.
#
# The K length k-mers (and their reverse complements) of every
# sequence in the input files are gathered, sorted and passed
# through uniq(), then written out with kset.write().

if len(sys.argv) < 4:
    print >> sys.stderr, "usage: make-kset.py <K> <output-filename> <input-FASTA>...."
    sys.exit(1)

K = int(sys.argv[1])
outputName = sys.argv[2]
fastaNames = sys.argv[3:]

xs = []
for fastaName in fastaNames:
    with open(fastaName) as fasta:
        for (name, seq) in readFasta(fasta):
            xs.extend(kmers(K, seq, True))

xs.sort()
uniq(xs)
kset.write(K, xs, outputName)
# file. # xs = {} with gzip.open(sys.argv[2]) as f: # Now parse the FASTA file, to get the <name, sequence> pairs. for (nm, seq) in readFasta(f): # From each sequence extract k-mers. # # Note that the kmers() function is a generator - it uses # yield to return each k-mer in turn rather than composing # them in to a list or a set. # # The third argument is a boolean flag that indicates whether # or not to return reverse complement versions of the k-mers # as well as the "forward" ones in the sequence. for x in kmers(K, seq, True): # If this is the first instance of this k-mer # initialize an entry in the dict. if x not in xs: xs[x] = 0 # Add this k-mer instance xs[x] += 1 # Step 2 # Iterate over the dictionary, and compile the k-mer frequency # histogram. # hist = {} for (x,f) in xs.iteritems(): if f not in hist: hist[f] = 0