Esempio n. 1
0
def buildIndex(K, inputs, output):
    """
    Create a new k-mer index. The FASTA files named in the list
    `inputs` are read in and the `K` length k-mers and their reverse
    complements are extracted and collated to create an index that
    maps from k-mer to sequence number (numbering from 0). The
    `names` member of the KmerIndex object can be used to retrieve
    the name from the sequence number.

    The following items are written to the container `output`:

      * the collated k-mer set (`S.xs`), via `writeKmers`;
      * 'offsets' - one entry per k-mer rank plus a final entry;
        the postings for the k-mer with rank r start at offsets[r]
        and the final entry holds the total number of postings;
      * 'postings' - the sequence numbers, grouped by k-mer rank;
      * 'lens' - the number of k-mers kept for each sequence;
      * the sequence names, stored in the metadata under 'names'.

    Sequence numbers are stored in an array of unsigned shorts
    (typecode 'H'), so storing a sequence number that does not fit
    in that type raises OverflowError.
    """
    # Read every (name, sequence) pair from all of the input files.
    seqs = []
    for inp in inputs:
        with openFile(inp) as f:
            seqs.extend(readFasta(f))

    # Extract the k-mers of each sequence, and pool them all so the
    # global k-mer set can be built.
    # NOTE(review): uniq() is called for its effect on the list, so it
    # is expected to de-duplicate the sorted list in place - confirm.
    allKmers = []
    nms = []
    lens = array.array('I')
    for i, (nm, seq) in enumerate(seqs):
        nms.append(nm)
        xs = list(kmers(K, seq, True))
        xs.sort()
        uniq(xs)
        # From here on only the k-mers of the sequence are needed,
        # so they take the place of the sequence text.
        seqs[i] = [nm, xs]
        lens.append(len(xs))
        allKmers.extend(xs)
    allKmers.sort()
    uniq(allKmers)
    S = sparse(2*K, allKmers)

    # Pass 1: count how many sequences contain each k-mer. The extra
    # trailing slot ends up holding the grand total after the prefix
    # sum below.
    T = array.array('I', [0]) * (S.count() + 1)
    for (_, xs) in seqs:
        for x in xs:
            T[S.rank(x)] += 1

    # Turn the counts in to starting offsets (exclusive prefix sum).
    total = 0
    for r, c in enumerate(T):
        T[r] = total
        total += c

    # Pass 2: drop each sequence number in to the next free posting
    # slot of every k-mer the sequence contains. `nxt` is a scratch
    # copy of the offsets so that T itself is left intact for writing.
    nxt = list(T)
    U = array.array('H', [0]) * total
    for i, (_, xs) in enumerate(seqs):
        for x in xs:
            r = S.rank(x)
            U[nxt[r]] = i
            nxt[r] += 1

    with container(output, 'w') as z:
        writeKmers(K, S.xs, z)
        n = write32(z, T, 'offsets')
        z.meta['T'] = n
        n = write16(z, U, 'postings')
        z.meta['U'] = n
        n = write32(z, lens, 'lens')
        z.meta['lens'] = n
        z.meta['names'] = nms
Esempio n. 2
0
def test_kmers():
    """
    Check that basics.kmers (without reverse complements) yields the
    same values, in the same order, as applying basics.kmer to each
    successive window of the sequence and dropping the windows for
    which it returns None.
    """
    # The sequence deliberately contains a couple of 'N' characters.
    seq = "CCTCGTACGCCATATTTTCGCATTTCACGTACGTATTGTTTTTGCAACATAATTACCTATTCTCTTTTGGGGGGGGTTTTAGGCATTCCATTTAATNGCTTTTCTTTTAATGCATGGAGTTTTTCCCATTCATCCTTTGATATATTATCTTTACTTGCTTCGAAGTCTNTTGCTGTGAGATGTATATCTTCTGGATGGATTTGTTTACGTTCTTTTGTTACTGGATCTATAGTAAATGGAATCATTTCCTT"
    width = 25
    actual = list(basics.kmers(width, seq, False))

    # Encode every window of `width` characters one at a time, keeping
    # only the windows that basics.kmer could encode.
    lastStart = len(seq) - width
    encoded = (basics.kmer(seq[p:p + width]) for p in range(lastStart + 1))
    expected = [v for v in encoded if v is not None]

    assert actual == expected
Esempio n. 3
0
from pykmer.basics import kmers
from pykmer.file import readFasta
from pykmer.misc import uniq
import pykmer.kset as kset

import sys

# The script needs K, an output file name and at least one input
# FASTA file.
if len(sys.argv) < 4:
    # sys.stderr.write produces the same output under Python 2 and
    # Python 3, unlike the Python 2 only `print >>` statement.
    sys.stderr.write("usage: make-kset.py <K> <output-filename> <input-FASTA>....\n")
    sys.exit(1)

K = int(sys.argv[1])

# Pool the k-mers of every sequence in every input file. The third
# argument to kmers() is passed as True for every sequence.
xs = []
for fn in sys.argv[3:]:
    with open(fn) as f:
        for (nm, seq) in readFasta(f):
            # extend() consumes the k-mers directly, without building
            # a temporary list for each sequence.
            xs.extend(kmers(K, seq, True))

# Sort and de-duplicate the pooled k-mers, then write them out as a
# k-mer set to the output file named on the command line.
xs.sort()
uniq(xs)
kset.write(K, xs, sys.argv[2])
Esempio n. 4
0
# file.
#
xs = {}
with gzip.open(sys.argv[2]) as f:
    # Walk over the <name, sequence> pairs parsed from the FASTA file.
    for (nm, seq) in readFasta(f):
        # Pull the k-mers out of each sequence.
        #
        # kmers() is a generator: it yields the k-mers one at a time
        # instead of collecting them in to a list or a set.
        #
        # Its third argument is a boolean flag; passing True asks for
        # the reverse complement k-mers as well as the "forward" ones
        # in the sequence.
        for x in kmers(K, seq, True):
            # Count this k-mer instance. A k-mer that has not been
            # seen before starts from a count of zero.
            xs[x] = xs.get(x, 0) + 1

# Step 2
# Iterate over the dictionary, and compile the k-mer frequency
# histogram.
#
hist = {}
for (x,f) in xs.iteritems():
    if f not in hist:
        hist[f] = 0