Exemple #1
0
    def test_consume(self):
        """Compare consume() against counting every window by hand.

        Currently disabled: the bare ``return`` below (marked @CTB) exits
        before any of the checks run.
        """
        return  # @CTB

        table = self.kt

        # Feed one test string through consume() ...
        seq = "ATGAGAGACACAGGGAGAGACCCAATTAGAGAATTGGACC"
        table.consume(seq)

        # ... and count each length-L window of it individually in a second
        # table.
        reference = khmer.new_ktable(L)
        for offset in range(len(seq) - L + 1):
            reference.count(seq[offset:offset + L])

        # Both tables must agree entry for entry.
        for idx in range(table.n_entries()):
            consumed = table.get(idx)
            counted = reference.get(idx)
            assert consumed == counted

        # set() must overwrite every entry and get() must read it back.
        for idx in range(table.n_entries()):
            table.set(idx, 1)

        for idx in range(table.n_entries()):
            assert table.get(idx) == 1
Exemple #2
0
    def test_consume(self):
        """Verify that consume() tallies k-mers the same way count() does.

        Disabled for now: the unconditional ``return`` tagged @CTB leaves
        before any assertion is reached.
        """
        return                          # @CTB

        kt = self.kt

        # Load a fixed test string with a single consume() call.
        text = "ATGAGAGACACAGGGAGAGACCCAATTAGAGAATTGGACC"
        kt.consume(text)

        # Build the expected table by counting each L-length word in turn.
        expected = khmer.new_ktable(L)
        last_start = len(text) - L
        for begin in range(last_start + 1):
            expected.count(text[begin:begin + L])

        n_entries = kt.n_entries()
        for slot in range(n_entries):
            # the consume() tally must equal the count() tally
            assert kt.get(slot) == expected.get(slot)

        # Overwrite every entry with 1, then read each one back.
        for slot in range(kt.n_entries()):
            kt.set(slot, 1)

        for slot in range(kt.n_entries()):
            assert kt.get(slot) == 1
Exemple #3
0
    def test_collision_2(self):
        """Check per-table and min/max counts for a k-mer with one collision.

        Currently disabled: the bare ``return`` below (marked @CTB) exits
        before any of the checks run.
        """
        return                          # @CTB
        kt = khmer.new_ktable(10)
        
        # Pin the forward hashes of the three 10-mers used below.
        GG = 'G' * 10                   # forward_hash: 1048575
        assert kt.forward_hash(GG) == 1048575

        collision_1 = 'AACGGTCGGA'      # forward_hash: 48572
        assert kt.forward_hash(collision_1) == 48572

        collision_2 = 'AACTTGTTAC'      # forward_hash: 38738
        assert kt.forward_hash(collision_2) == 38738

        # note, hash(GG) % 1000003 == hash(collision_1)
        # note, hash(GG) % 1009837 == hash(collision_2)
        # (presumably 1000003 and 1009837 are the sizes of the two tables
        # behind self.hi -- confirm against the fixture)

        # Only GG and collision_2 are consumed; collision_1 is never added.
        hi = self.hi
        hi.consume(GG)
        hi.consume(collision_2)

        # Per-table counts for GG, read through the private table handles.
        assert hi._kh1.get(GG) == 1
        assert hi._kh2.get(GG) == 2

        # NOTE(review): _kh2 is expected to report 2 for GG above, yet
        # get_max_count is expected to be 1 here -- confirm the intended
        # semantics before re-enabling this test.
        assert hi.get_min_count(GG) == 1
        assert hi.get_max_count(GG) == 1
Exemple #4
0
def test_complete_no_collision():
    """Fill a 4-mer hashtable with one slot per k-mer and check occupancy.

    Every 4-mer is counted once by string.  The test then tallies how many
    k-mers are found by string lookup, how many of those have a count of
    exactly 1, and how many are found by integer lookup.
    """
    kh = khmer.new_hashtable(4, 4**4)
    kt = khmer.new_ktable(4)

    # Count every possible 4-mer once, addressed by its string form.
    for code in range(kt.n_entries()):
        kh.count(kt.reverse_hash(code))

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0

    for code in range(kt.n_entries()):
        word = kt.reverse_hash(code)
        by_string = kh.get(word)
        if by_string:                   # string hashing is rc aware
            n_rc_filled += 1
        if by_string == 1:              # palindromes are singular
            n_palindromes += 1
        if kh.get(code):                # int hashing is not rc aware
            n_fwd_filled += 1

    total = kt.n_entries()
    assert n_rc_filled == total, n_rc_filled
    assert n_palindromes == 16, n_palindromes  # @CTB check this
    # Floor division gives the same integers the original '/' produced.
    assert n_fwd_filled == total // 2 + n_palindromes // 2, \
        n_fwd_filled
Exemple #5
0
def enumerate_kmers(kmer_len):
    """
    Return every k-mer of length ``kmer_len`` as a list of strings.

    Position ``n`` of the list holds the string that ``reverse_hash(n)``
    yields for a ktable of that word size.
    """
    table = khmer.new_ktable(kmer_len)
    all_kmers = []
    for hash_value in range(table.n_entries()):
        all_kmers.append(table.reverse_hash(hash_value))
    return all_kmers
Exemple #6
0
def enumerate_kmers(kmer_len):
    """
    List all k-mers of the given length, in hash order, as strings.
    """
    ktable = khmer.new_ktable(kmer_len)
    n_kmers = ktable.n_entries()
    decode = ktable.reverse_hash
    # One reverse_hash call per hash value, lowest value first.
    return [decode(value) for value in range(n_kmers)]
def process_file(filename, HT_SIZE_array):
    """Measure how far counting-hash counts drift from exact k-mer counts.

    For each table size in ``HT_SIZE_array`` the sequences in ``filename``
    are loaded into a counting hash (``N_HT`` tables, ``K``-mers) and into a
    ktable, and the two counts are compared for every k-mer the ktable
    reports as present.

    Returns a four-element list:
        [average miscount, one value per table size,
         fraction of present k-mers with a positive miscount, per table size,
         number of distinct k-mers seen for the last table size
             (0 when ``HT_SIZE_array`` is empty),
         average relative miscount, one value per table size]
    """
    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []
    # Bound up front so the return value is well defined for an empty
    # HT_SIZE_array instead of raising NameError.
    total_kmer = 0

    print(filename)
    for HT_SIZE in HT_SIZE_array:
        print(HT_SIZE)
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        ktable = khmer.new_ktable(K)
        for record in screed.open(filename):
            ktable.consume(record['sequence'])

        list_miscount = []
        list_miscount_perc = []
        total_kmer = 0  # number of distinct k-mers present in the file
        miscount0 = 0   # how many of them have a positive miscount

        for i in range(0, ktable.n_entries()):
            if not ktable.get(i):
                continue
            total_kmer += 1
            kmer2 = ktable.reverse_hash(i)
            exact = ktable.get(kmer2)
            miscount = ht.get(kmer2) - exact
            list_miscount.append(miscount)
            # float() forces true division; with two ints Python 2 would
            # truncate the relative error to a whole number.
            list_miscount_perc.append(float(miscount) / exact)
            if miscount > 0:
                miscount0 += 1

        if total_kmer:
            average_miscount = float(sum(list_miscount)) / total_kmer
            average_miscount_perc = (
                float(sum(list_miscount_perc)) / total_kmer)
            fp_miscount0 = float(miscount0) / total_kmer
        else:
            # No k-mers at all: report zero error rather than dividing by
            # zero.
            average_miscount = 0.0
            average_miscount_perc = 0.0
            fp_miscount0 = 0.0

        list_average_miscount.append(average_miscount)
        list_average_miscount_perc.append(average_miscount_perc)
        list_fp_miscount0.append(fp_miscount0)

    to_return = [
        list_average_miscount, list_fp_miscount0, total_kmer,
        list_average_miscount_perc
    ]
    return to_return
def count_kmers_and_reads(in_fastq, kmer_size):
    """Count k-mers and identical reads in a FASTQ file.

    Reads whose sequence contains "N" are left out of both tallies.

    Returns a ``(ktable, read_count)`` pair: the ktable has consumed every
    accepted read, and ``read_count`` is a plain dict mapping each accepted
    read sequence to the number of times it occurred.
    """
    ktable = khmer.new_ktable(kmer_size)
    read_count = collections.defaultdict(int)
    with open(in_fastq) as in_handle:
        for _, seq, _ in FastqGeneralIterator(in_handle):
            if "N" in seq:
                continue
            ktable.consume(seq)
            read_count[seq] += 1
    return ktable, dict(read_count)
def count_kmers_and_reads(in_fastq, kmer_size):
    """Tally k-mers and duplicate reads from a FASTQ file.

    Any read with an "N" in its sequence is skipped entirely.

    Returns ``(ktable, read_count)``: ``ktable`` has consumed each accepted
    read, and ``read_count`` is a regular dict from read sequence to its
    number of occurrences.
    """
    ktable = khmer.new_ktable(kmer_size)
    read_count = collections.defaultdict(int)
    with open(in_fastq) as in_handle:
        for _, seq, _ in FastqGeneralIterator(in_handle):
            if "N" not in seq:
                ktable.consume(seq)
                read_count[seq] += 1
    return ktable, dict(read_count)
def process_file(filename,HT_SIZE_array):
    """Measure how far counting-hash counts drift from exact k-mer counts.

    For each table size in ``HT_SIZE_array`` the sequences in ``filename``
    are loaded into a counting hash (``N_HT`` tables, ``K``-mers), and every
    ``K``-length window of every sequence is counted in a ktable.  The two
    counts are then compared for each k-mer the ktable reports as present.

    Returns a four-element list:
        [average miscount, one value per table size,
         fraction of present k-mers with a positive miscount, per table size,
         number of distinct k-mers seen for the last table size
             (0 when ``HT_SIZE_array`` is empty),
         average relative miscount, one value per table size]
    """
    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []
    # Bound up front so the return value is well defined for an empty
    # HT_SIZE_array instead of raising NameError.
    total_kmer = 0

    print(filename)
    for HT_SIZE in HT_SIZE_array:
        print(HT_SIZE)
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        ktable = khmer.new_ktable(K)
        # The context manager closes the input handle once all records have
        # been read; the original left it open.
        with open(filename) as fasta_file:
            for record in fasta_iter(fasta_file):
                sequence = record['sequence']
                for start in range(0, len(sequence) + 1 - K):
                    ktable.count(sequence[start:start + K])

        list_miscount = []
        list_miscount_perc = []
        total_kmer = 0  # number of distinct k-mers present in the file
        miscount0 = 0   # how many of them have a positive miscount

        for i in range(0, ktable.n_entries()):
            if not ktable.get(i):
                continue
            total_kmer += 1
            kmer2 = ktable.reverse_hash(i)
            exact = ktable.get(kmer2)
            miscount = ht.get(kmer2) - exact
            list_miscount.append(miscount)
            # float() forces true division; with two ints Python 2 would
            # truncate the relative error to a whole number.
            list_miscount_perc.append(float(miscount) / exact)
            if miscount > 0:
                miscount0 += 1

        if total_kmer:
            average_miscount = float(sum(list_miscount)) / total_kmer
            average_miscount_perc = (
                float(sum(list_miscount_perc)) / total_kmer)
            fp_miscount0 = float(miscount0) / total_kmer
        else:
            # No k-mers at all: report zero error rather than dividing by
            # zero.
            average_miscount = 0.0
            average_miscount_perc = 0.0
            fp_miscount0 = 0.0

        list_average_miscount.append(average_miscount)
        list_average_miscount_perc.append(average_miscount_perc)
        list_fp_miscount0.append(fp_miscount0)

    to_return = [list_average_miscount, list_fp_miscount0, total_kmer,
                 list_average_miscount_perc]
    return to_return
Exemple #11
0
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """Build a ktable from the sequences in ``fastx_fname`` and time the load.

    Sequences shorter than ``kmer_len`` are skipped.  The elapsed wall-clock
    time is printed, and the filled ktable is returned.
    """
    started = time.time()
    ktable = khmer.new_ktable(kmer_len)
    for _name, sequence in fastx_utils.get_fastx_entries(fastx_fname):
        # Only sequences at least kmer_len long are consumed.
        if len(sequence) >= kmer_len:
            ktable.consume(sequence)
    elapsed = time.time() - started
    print("Loading up of seqs into ktable took %.2f seconds." % elapsed)
    return ktable
Exemple #12
0
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """Consume every long-enough sequence of a file into a new ktable.

    Entries whose sequence is shorter than ``kmer_len`` are ignored.  The
    time taken is reported on standard output before the ktable is returned.
    """
    t_start = time.time()
    table = khmer.new_ktable(kmer_len)
    for entry in fastx_utils.get_fastx_entries(fastx_fname):
        _entry_name, entry_seq = entry
        too_short = len(entry_seq) < kmer_len
        if not too_short:
            table.consume(entry_seq)
    t_end = time.time()
    message = "Loading up of seqs into ktable took %.2f seconds." % (
        t_end - t_start)
    print(message)
    return table
Exemple #13
0
    def test_update(self):
        """After update(), non-zero entries must be multiples of 4 or of 5."""
        kt = self.kt

        # Fixture table: set the entries at multiples of 4.
        for step in range(4**L // 4):
            kt.set(4 * step, 1)

        # Second table: set the entries at multiples of 5.
        other = khmer.new_ktable(L)
        for step in range(4**L // 5):
            other.set(5 * step, 1)

        kt.update(other)

        for index in range(4**L):
            if not kt.get(index):
                continue
            assert index % 4 == 0 or index % 5 == 0
Exemple #14
0
    def test_update(self):
        """Merge two tables with update() and check which entries are set.

        One table has the multiples of 4 set, the other the multiples of 5.
        After the merge, every entry that is non-zero must be a multiple of
        at least one of the two.
        """
        kt = self.kt
        n_slots = 4 ** L

        for k in range(n_slots // 4):
            kt.set(k * 4, 1)

        kt2 = khmer.new_ktable(L)
        for k in range(n_slots // 5):
            kt2.set(k * 5, 1)

        kt.update(kt2)

        for slot in range(n_slots):
            if kt.get(slot):
                assert slot % 4 == 0 or slot % 5 == 0
Exemple #15
0
def calc_tetra(seqs):
    """Print a dict of tetramer counts accumulated over ``seqs``.

    Each item of ``seqs`` must expose a ``.seq`` attribute; its string form
    is consumed into a 4-mer ktable.  Nothing is returned.
    """
    ktable = khmer.new_ktable(4)
    for seq in seqs:
        ktable.consume(str(seq.seq))

    # All 256 words over A/C/G/T, generated in the same nested order as four
    # explicit loops (a outermost, d innermost).
    bases = ['A', 'C', 'G', 'T']
    words = [a + b + c + d
             for a in bases for b in bases for c in bases for d in bases]

    tetramers = {}
    for word in words:
        tetramers[word] = int(ktable.get(word))
    print(tetramers)
def main(inp_name, outp_name, min_seq_len):
    """Write one k-mer count vector per sufficiently long input record.

    inp_name    -- sequence file opened with screed
    outp_name   -- output path; each line holds the first tab-separated
                   field of the record name, then the 4**KSIZE counts,
                   space-separated
    min_seq_len -- records shorter than this are skipped; converted with
                   int(), so a numeric string is accepted
    """
    # The context manager closes (and flushes) the output file even when a
    # record fails part-way; the original never closed it.
    with open(outp_name, 'w') as outfp:
        min_seq_len = int(min_seq_len)

        for record in screed.open(inp_name):
            if len(record.sequence) < min_seq_len:
                continue

            # A fresh table per record so counts never carry over.
            kt = khmer.new_ktable(KSIZE)
            kt.consume(record.sequence)

            vector = " ".join(["%s" % (kt.get(i), )
                               for i in range(4**KSIZE)])
            outfp.write("%s %s\n" % (record.name.split('\t')[0], vector))
Exemple #17
0
def main(inp_name, outp_name, min_seq_len):
    """Write one k-mer count vector per sufficiently long input record.

    inp_name    -- sequence file opened with screed
    outp_name   -- output path; each line holds the first tab-separated
                   field of the record name, then the 4**KSIZE counts,
                   space-separated
    min_seq_len -- records shorter than this are skipped; converted with
                   int(), so a numeric string is accepted
    """
    # The context manager closes (and flushes) the output file even when a
    # record fails part-way; the original never closed it.
    with open(outp_name, 'w') as outfp:
        min_seq_len = int(min_seq_len)

        for record in screed.open(inp_name):
            if len(record.sequence) < min_seq_len:
                continue

            # A fresh table per record so counts never carry over.
            kt = khmer.new_ktable(KSIZE)
            kt.consume(record.sequence)

            vector = " ".join(["%s" % (kt.get(i),)
                               for i in range(4**KSIZE)])
            outfp.write("%s %s\n" % (record.name.split('\t')[0], vector))
Exemple #18
0
    def test_intersection(self):
        """intersect() must keep only entries set in both tables.

        One table has the multiples of 4 set, the other the multiples of 5.
        Every non-zero entry of the intersection must be a multiple of both.
        """
        kt = self.kt
        n_slots = 4 ** L

        for k in range(n_slots // 4):
            kt.set(k * 4, 1)

        kt2 = khmer.new_ktable(L)
        for k in range(n_slots // 5):
            kt2.set(k * 5, 1)

        kt3 = kt.intersect(kt2)

        # Entry 20 is set in both inputs; the intersection must hold 2 there.
        assert kt3.get(20) == 2
        for slot in range(n_slots):
            if not kt3.get(slot):
                continue
            assert slot % 4 == 0
            assert slot % 5 == 0
Exemple #19
0
    def test_intersection(self):
        """Intersect a multiples-of-4 table with a multiples-of-5 table."""
        kt = self.kt

        # Fixture table: set the entries at multiples of 4.
        for step in range(4**L // 4):
            kt.set(4 * step, 1)

        # Second table: set the entries at multiples of 5.
        other = khmer.new_ktable(L)
        for step in range(4**L // 5):
            other.set(5 * step, 1)

        common = kt.intersect(other)

        # 20 is a multiple of both 4 and 5; its entry is expected to be 2.
        assert common.get(20) == 2
        for index in range(4**L):
            if common.get(index):
                assert index % 4 == 0
                assert index % 5 == 0
Exemple #20
0
def test_complete_4_collision():
    """Count every 4-mer into a hashtable a quarter of the size.

    The hashtable is created with 4**4 / 4 == 64 slots.  After counting all
    4-mers by string, string lookup must succeed for each of the first 64
    k-mers.
    """
    kh = khmer.new_hashtable(4, 4**4 // 4)
    kt = khmer.new_ktable(4)

    # Count every possible 4-mer once, addressed by its string form.
    for code in range(kt.n_entries()):
        kh.count(kt.reverse_hash(code))

    n_rc_filled = 0
    n_fwd_filled = 0

    for code in range(64):
        word = kt.reverse_hash(code)
        if kh.get(word):                # string hashing is rc aware
            n_rc_filled += 1
        if kh.get(code):                # int hashing is not rc aware
            n_fwd_filled += 1

    # Only the string-lookup tally is checked; n_fwd_filled is computed but
    # not asserted on.
    assert n_rc_filled == 64, n_rc_filled
Exemple #21
0
    def test_collision_2(self):
        """GG must be reported once even when a colliding k-mer is consumed.

        Per the modulus notes below, collision_1 and collision_2 have the
        same hash as GG modulo 1000003 and 1009837 respectively (presumably
        the table sizes behind self.hi -- confirm against the fixture).
        Only collision_2 is consumed alongside GG.
        """
        # No ktable is created here: the forward hashes come from the
        # module-level khmer.forward_hash.
        GG = 'G' * 12                   # forward_hash: 11184810
        assert khmer.forward_hash(GG, 12) == 11184810

        # Plain integer literals compare equal to long values, so the
        # Python-2-only 'L' suffix is not needed.
        collision_1 = 'AAACGTATGACT'
        assert khmer.forward_hash(collision_1, 12) == 184777

        collision_2 = 'AAATACCGAGCG'
        assert khmer.forward_hash(collision_2, 12) == 76603

        # hash(GG) % 1000003 == hash(collision_1)
        # hash(GG) % 1009837 == hash(collision_2)

        hi = self.hi
        hi.consume(GG)
        hi.consume(collision_2)

        assert hi.get(GG) == 1
Exemple #22
0
    def test_collision_2(self):
        """GG must be reported once even when a colliding k-mer is consumed.

        Per the modulus notes below, collision_1 and collision_2 have the
        same hash as GG modulo 1000003 and 1009837 respectively (presumably
        the table sizes behind self.hi -- confirm against the fixture).
        Only collision_2 is consumed alongside GG.
        """
        # No ktable is created here: the forward hashes come from the
        # module-level khmer.forward_hash.
        GG = 'G' * 12                   # forward_hash: 11184810
        assert khmer.forward_hash(GG, 12) == 11184810

        # Plain integer literals compare equal to long values, so the
        # Python-2-only 'L' suffix is not needed.
        collision_1 = 'AAACGTATGACT'
        assert khmer.forward_hash(collision_1, 12) == 184777

        collision_2 = 'AAATACCGAGCG'
        assert khmer.forward_hash(collision_2, 12) == 76603

        # hash(GG) % 1000003 == hash(collision_1)
        # hash(GG) % 1009837 == hash(collision_2)

        hi = self.hi
        hi.consume(GG)
        hi.consume(collision_2)

        assert hi.get(GG) == 1
Exemple #23
0
def makeKmerArray(screedb,ksize,normalize):
	"""
	This takes a screedb file and a k-mer size in inputs,
	and print the ktable.
	"""
	ktable = khmer.new_ktable(ksize)
	knames=[]
	print str(ksize) + "-mer" + sep,
	for i in range(0, ktable.n_entries()):
		knames.append(ktable.reverse_hash(i))
	print sep.join(knames)
    	
	if norm:
		for record in screedb.itervalues():
			tot=float()
			kmers=[]
			ktable.clear()
			ktable.consume(str(record.sequence))
			print record.name,
			for i in range(0, ktable.n_entries()):
				kmers.append(ktable.get(i))
				tot=tot+ktable.get(i)
			#print kmers
			kmersNorm = [float(x)/tot for x in kmers]
			#print kmersNorm
			for ele in kmersNorm:
				sys.stdout.write(sep + '%f' % (ele))
			print


	else:
		for record in screedb.itervalues():
			ktable.clear()
			ktable.consume(str(record.sequence))
			sys.stdout.write(record.name)
			for i in range(0, ktable.n_entries()):
			    sys.stdout.write(sep + str(ktable.get(i)))
			print
Exemple #24
0
def makeKmerArray(screedb, ksize, normalize):
    """
	This takes a screedb file and a k-mer size in inputs,
	and print the ktable.
	"""
    ktable = khmer.new_ktable(ksize)
    knames = []
    print str(ksize) + "-mer" + sep,
    for i in range(0, ktable.n_entries()):
        knames.append(ktable.reverse_hash(i))
    print sep.join(knames)

    if norm:
        for record in screedb.itervalues():
            tot = float()
            kmers = []
            ktable.clear()
            ktable.consume(str(record.sequence))
            print record.name,
            for i in range(0, ktable.n_entries()):
                kmers.append(ktable.get(i))
                tot = tot + ktable.get(i)
            #print kmers
            kmersNorm = [float(x) / tot for x in kmers]
            #print kmersNorm
            for ele in kmersNorm:
                sys.stdout.write(sep + '%f' % (ele))
            print

    else:
        for record in screedb.itervalues():
            ktable.clear()
            ktable.consume(str(record.sequence))
            sys.stdout.write(record.name)
            for i in range(0, ktable.n_entries()):
                sys.stdout.write(sep + str(ktable.get(i)))
            print
Exemple #25
0
# Script: build a shuffled genome with equal base composition and report how
# many distinct K-mers it contains.
K = 12                                  # k-mer length
N = 25000                               # 1/4 the size of the genome
P_ERROR = .01                           # per-base probability of error

###

# construct a random genome: N copies of each base, shuffled, so the result
# is 4*N characters long
genome = "A"*N + "C"*N + "G"*N + "T"*N
genome = list(genome)
random.shuffle(genome)
genome = "".join(genome)

###

# count the number of unique k-mers
kt = khmer.new_ktable(K)
kt.consume(genome)

# every one of the 4**K table entries is inspected; a non-zero entry means
# that k-mer was seen in the genome
total = 0
for i in range(0, 4**K):
    if kt.get(i):
        total += 1

# the summary goes to stderr
print >> sys.stderr, "%d unique k-mers in genome" % total

###

# go through, sample with replacement and mutation, and calculate
# number of novel k-mers picked as a function of sampling.

# start again from an empty table for the sampling step
kt = khmer.new_ktable(K)
Exemple #26
0
 def setup(self):
     """Create a new ktable of word size L and store it as ``self.kt``."""
     fresh_table = khmer.new_ktable(L)
     self.kt = fresh_table
Exemple #27
0
            return round(estimate)

        
    def _bitscan(self, x, m):
        v = 1
        while v<=m and not x&0x80000000:
            v+=1
            x<<=1
        return v


if __name__ == '__main__':

    # Demo: feed the k-mers of a random 1000-base string to a HyperLogLog.
    k = 12
    kt = khmer.new_ktable(k)

    alphabet = {0: 'A', 1: 'T', 2: 'G', 3: 'C'}

    # One random.randint call per base, joined into a single string.
    given_string = ''.join(
        [alphabet[random.randint(0, 3)] for _ in range(1000)])

    # consume() returns the number used as the loop bound below.
    n = kt.consume(given_string)
    H = HyperLogLog(8)

    for i in range(n):
        window = given_string[i:i + k]
        H.add(window, k)

        
from threading import Thread
import time
import khmer

class ConsumeThread(Thread):
    """Thread that runs ``khmer.consume_genome`` on one piece of sequence.

    The result is stored as ``self.kt`` once ``run()`` has completed; the
    attribute does not exist before that.
    """

    def __init__(self, wordsize, genome):
        Thread.__init__(self)
        self.wordsize, self.genome = wordsize, genome

    def run(self):
        result = khmer.consume_genome(self.wordsize, self.genome)
        self.kt = result

# Script: count 5-mers in the two halves of a genome on separate threads,
# then merge the per-thread tables into one.

# Read the whole genome up front; the context manager closes the handle
# instead of leaving it to the garbage collector.
with open('/tmp/all2.dna') as genome_file:
    genome = genome_file.read()

nthreads = 2
length = len(genome)
# Floor division keeps the split point an integer even under true division.
genome1 = genome[:length // 2]
genome2 = genome[length // 2:]
t1 = ConsumeThread(5, genome1)
t2 = ConsumeThread(5, genome2)

# Run both halves concurrently and wait for both to finish.
t1.start()
t2.start()
t1.join()
t2.join()

# Fold each thread's table into a single master table.
master_kt = khmer.new_ktable(5)
master_kt.update(t1.kt)
master_kt.update(t2.kt)
Exemple #29
0
 def setup(self):
     """Store a newly created ktable of word size L on ``self.kt``."""
     new_table = khmer.new_ktable(L)
     self.kt = new_table
Exemple #30
0
    ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)

    unique_kmer = []
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0,seq_len+1-K):
            kmer = sequence[n:n+K]
            if (not ht.get(kmer)):
               unique_kmer.append(kmer) 
            ht.count(kmer)

#for kmer2 in unique_kmer:
#    print kmer2,ht.get(kmer2)
            
    ktable = khmer.new_ktable(K)
    for n, record in enumerate(fasta_iter(open(filename))):
        sequence = record['sequence']
        seq_len = len(sequence)
        for n in range(0,seq_len+1-K):
            kmer = sequence[n:n+K]
            ktable.count(kmer)

    false = 0
    all_n = 1
    for kmer2 in unique_kmer:
        if ht.get(kmer2) != ktable.get(kmer2):
	    false = false+1
        all_n = all_n +1
    fp = false*100.0/all_n
    
Exemple #31
0
import sys
sys.path.insert(0, 'build/lib.linux-i686-2.3/')

import khmer
# Count the 6-mers of a fixed test string and report each one that occurs.
ktable = khmer.new_ktable(6)
ktable.consume("ATGAGAGACACAGGGAGAGACCCAATTAGAGAATTGGACC")
for i in range(ktable.n_entries()):
    n = ktable.get(i)
    if not n:
        continue
    # A single formatted string yields the same space-separated line as a
    # multi-argument print statement.
    print("%s is present %s time(s)." % (ktable.reverse_hash(i), n))
Exemple #32
0
import khmer


class ConsumeThread(Thread):
    """Worker thread wrapping a single ``khmer.consume_genome`` call.

    ``run()`` stores the call's result on ``self.kt``; the attribute is not
    set until then.
    """

    def __init__(self, wordsize, genome):
        # Arguments are kept on the instance for run() to pick up later.
        self.genome = genome
        self.wordsize = wordsize
        Thread.__init__(self)

    def run(self):
        wordsize, genome = self.wordsize, self.genome
        self.kt = khmer.consume_genome(wordsize, genome)


# Script: count 5-mers in the two halves of a genome on separate threads,
# then merge the per-thread tables into one.

# Read the whole genome up front; the context manager closes the handle
# instead of leaving it to the garbage collector.
with open('/tmp/all2.dna') as genome_file:
    genome = genome_file.read()

nthreads = 2
length = len(genome)
# Floor division keeps the split point an integer even under true division.
genome1 = genome[:length // 2]
genome2 = genome[length // 2:]
t1 = ConsumeThread(5, genome1)
t2 = ConsumeThread(5, genome2)

# Run both halves concurrently and wait for both to finish.
t1.start()
t2.start()
t1.join()
t2.join()

# Fold each thread's table into a single master table.
master_kt = khmer.new_ktable(5)
master_kt.update(t1.kt)
master_kt.update(t2.kt)