def test_consume(self): return # @CTB kt = self.kt # consume a test string, and verify that consume works. s = "ATGAGAGACACAGGGAGAGACCCAATTAGAGAATTGGACC" kt.consume(s) kt2 = khmer.new_ktable(L) for start in range(0, len(s) - L + 1): word = s[start:start + L] kt2.count(word) for i in range(0, kt.n_entries()): n = kt.get(i) # test 'consume_str' numbers n3 = kt2.get(i) # and 'count' count. assert n == n3 for i in range(0, kt.n_entries()): kt.set(i, 1) for i in range(0, kt.n_entries()): assert (kt.get(i) == 1)
def test_consume(self): return # @CTB kt = self.kt # consume a test string, and verify that consume works. s = "ATGAGAGACACAGGGAGAGACCCAATTAGAGAATTGGACC" kt.consume(s) kt2 = khmer.new_ktable(L) for start in range(0, len(s) - L + 1): word = s[start:start + L] kt2.count(word) for i in range(0, kt.n_entries()): n = kt.get(i) # test 'consume_str' numbers n3 = kt2.get(i) # and 'count' count. assert n == n3 for i in range(0, kt.n_entries()): kt.set(i, 1) for i in range(0, kt.n_entries()): assert(kt.get(i) == 1)
def test_collision_2(self): return # @CTB kt = khmer.new_ktable(10) GG = 'G' * 10 # forward_hash: 1048575 assert kt.forward_hash(GG) == 1048575 collision_1 = 'AACGGTCGGA' # forward_hash: 48572 assert kt.forward_hash(collision_1) == 48572 collision_2 = 'AACTTGTTAC' # forward_hash: 38738 assert kt.forward_hash(collision_2) == 38738 # note, hash(GG) % 1000003 == hash(collision_1) # note, hash(GG) % 1009837 == hash(collision_2) hi = self.hi hi.consume(GG) hi.consume(collision_2) assert hi._kh1.get(GG) == 1 assert hi._kh2.get(GG) == 2 assert hi.get_min_count(GG) == 1 assert hi.get_max_count(GG) == 1
def test_complete_no_collision():
    """Count every 4-mer into a 4**4-slot hashtable and check occupancy."""
    kh = khmer.new_hashtable(4, 4**4)
    kt = khmer.new_ktable(4)

    # Insert each possible 4-mer exactly once, by its string form.
    for index in range(kt.n_entries()):
        kh.count(kt.reverse_hash(index))

    n_palindromes = 0
    n_rc_filled = 0
    n_fwd_filled = 0

    for index in range(kt.n_entries()):
        word = kt.reverse_hash(index)
        by_string = kh.get(word)

        if by_string:            # string hashing is rc aware
            n_rc_filled += 1
        if by_string == 1:       # palindromes are singular
            n_palindromes += 1
        if kh.get(index):        # int hashing is not rc aware
            n_fwd_filled += 1

    assert n_rc_filled == kt.n_entries(), n_rc_filled
    assert n_palindromes == 16, n_palindromes  # @CTB check this
    expected_fwd = kt.n_entries() / 2 + n_palindromes / 2
    assert n_fwd_filled == expected_fwd, n_fwd_filled
def enumerate_kmers(kmer_len):
    """Return every k-mer of length ``kmer_len`` as a list of strings.

    The strings are obtained by reverse-hashing each index of a freshly
    created ktable, in increasing index order.
    """
    table = khmer.new_ktable(kmer_len)
    all_kmers = []
    for index in range(table.n_entries()):
        all_kmers.append(table.reverse_hash(index))
    return all_kmers
def process_file(filename, HT_SIZE_array):
    """Compare counting-hash k-mer counts with exact ktable counts.

    For each table size in ``HT_SIZE_array`` a counting hash is filled from
    ``filename`` and, for every k-mer that actually occurs in the file, the
    difference between the counting-hash count and the exact ktable count
    (the "miscount") is recorded.

    Args:
        filename: sequence file readable by both ``screed.open`` and
            ``consume_fasta``.
        HT_SIZE_array: iterable of counting-hash table sizes to evaluate.

    Returns:
        A list ``[list_average_miscount, list_fp_miscount0, total_kmer,
        list_average_miscount_perc]`` where the three lists hold one value
        per table size (mean miscount, fraction of k-mers with a positive
        miscount, mean miscount relative to the exact count) and
        ``total_kmer`` is the number of distinct k-mers present.
    """
    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []

    print(filename)

    # The exact counts depend only on the input file, not on the counting
    # hash size, so build the ktable once instead of once per HT_SIZE.
    ktable = khmer.new_ktable(K)
    records = screed.open(filename)
    try:
        for record in records:
            ktable.consume(record['sequence'])
    finally:
        # Close the reader when it offers close(); older readers may not.
        closer = getattr(records, 'close', None)
        if closer is not None:
            closer()

    # (k-mer, exact count) for every k-mer that occurs at least once.
    exact_counts = []
    for i in range(0, ktable.n_entries()):
        if ktable.get(i):
            kmer = ktable.reverse_hash(i)
            exact_counts.append((kmer, ktable.get(kmer)))
    total_kmer = len(exact_counts)  # total number of unique k-mers

    for HT_SIZE in HT_SIZE_array:
        print(HT_SIZE)
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        sum_miscount = 0
        sum_miscount_perc = 0.0
        miscount0 = 0
        for kmer, exact in exact_counts:
            miscount = ht.get(kmer) - exact
            sum_miscount += miscount
            # True division: integer division would floor the ratio.
            sum_miscount_perc += float(miscount) / exact
            if miscount > 0:
                miscount0 += 1

        if total_kmer:
            list_average_miscount.append(float(sum_miscount) / total_kmer)
            list_average_miscount_perc.append(sum_miscount_perc / total_kmer)
            list_fp_miscount0.append(float(miscount0) / total_kmer)
        else:
            # No k-mers at all: report zeros instead of dividing by zero.
            list_average_miscount.append(0.0)
            list_average_miscount_perc.append(0.0)
            list_fp_miscount0.append(0.0)

    return [
        list_average_miscount,
        list_fp_miscount0,
        total_kmer,
        list_average_miscount_perc,
    ]
def count_kmers_and_reads(in_fastq, kmer_size):
    """Count k-mers and identical reads in a FASTQ file.

    Args:
        in_fastq: path of the FASTQ file to read.
        kmer_size: k-mer length used for the ktable.

    Returns:
        A ``(ktable, read_count)`` tuple: the ktable that consumed every
        accepted read, and a plain dict mapping each accepted read
        sequence to the number of times it was seen.
    """
    ktable = khmer.new_ktable(kmer_size)
    read_count = collections.defaultdict(int)
    with open(in_fastq) as in_handle:
        for _, seq, _ in FastqGeneralIterator(in_handle):
            # Reads containing "N" are left out of both the k-mer table
            # and the read tally.
            # NOTE(review): reconstructed from whitespace-collapsed source;
            # confirm the read tally was inside the same condition.
            if "N" in seq:
                continue
            ktable.consume(seq)
            read_count[seq] += 1
    return ktable, dict(read_count)
def process_file(filename, HT_SIZE_array):
    """Compare counting-hash k-mer counts with exact ktable counts.

    For each table size in ``HT_SIZE_array`` a counting hash is filled from
    ``filename`` and, for every k-mer that actually occurs in the file, the
    difference between the counting-hash count and the exact ktable count
    (the "miscount") is recorded.

    Args:
        filename: FASTA file readable by ``fasta_iter`` and
            ``consume_fasta``.
        HT_SIZE_array: iterable of counting-hash table sizes to evaluate.

    Returns:
        A list ``[list_average_miscount, list_fp_miscount0, total_kmer,
        list_average_miscount_perc]`` where the three lists hold one value
        per table size (mean miscount, fraction of k-mers with a positive
        miscount, mean miscount relative to the exact count) and
        ``total_kmer`` is the number of distinct k-mers present.
    """
    N_HT = 4
    K = 12

    list_average_miscount = []
    list_average_miscount_perc = []
    list_fp_miscount0 = []

    print(filename)

    # The exact counts depend only on the input file, not on the counting
    # hash size, so build the ktable once instead of once per HT_SIZE.
    ktable = khmer.new_ktable(K)
    with open(filename) as fasta_file:
        for record in fasta_iter(fasta_file):
            sequence = record['sequence']
            # Count each window explicitly (consume() is not used here).
            for start in range(0, len(sequence) + 1 - K):
                ktable.count(sequence[start:start + K])

    # (k-mer, exact count) for every k-mer that occurs at least once.
    exact_counts = []
    for i in range(0, ktable.n_entries()):
        if ktable.get(i):
            kmer = ktable.reverse_hash(i)
            exact_counts.append((kmer, ktable.get(kmer)))
    total_kmer = len(exact_counts)  # total number of unique k-mers

    for HT_SIZE in HT_SIZE_array:
        print(HT_SIZE)
        ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
        ht.consume_fasta(filename)

        sum_miscount = 0
        sum_miscount_perc = 0.0
        miscount0 = 0
        for kmer, exact in exact_counts:
            miscount = ht.get(kmer) - exact
            sum_miscount += miscount
            # True division: integer division would floor the ratio.
            sum_miscount_perc += float(miscount) / exact
            if miscount > 0:
                miscount0 += 1

        if total_kmer:
            list_average_miscount.append(float(sum_miscount) / total_kmer)
            list_average_miscount_perc.append(sum_miscount_perc / total_kmer)
            list_fp_miscount0.append(float(miscount0) / total_kmer)
        else:
            # No k-mers at all: report zeros instead of dividing by zero.
            list_average_miscount.append(0.0)
            list_average_miscount_perc.append(0.0)
            list_fp_miscount0.append(0.0)

    return [
        list_average_miscount,
        list_fp_miscount0,
        total_kmer,
        list_average_miscount_perc,
    ]
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """Build a ktable from the sequences of a FASTA/FASTQ file.

    Sequences shorter than ``kmer_len`` are skipped.  The elapsed
    wall-clock time is printed before the ktable is returned.
    """
    started = time.time()
    ktable = khmer.new_ktable(kmer_len)

    # Load up the FASTA into ktable
    for _name, sequence in fastx_utils.get_fastx_entries(fastx_fname):
        # Only sequences long enough to hold one k-mer are consumed.
        if len(sequence) >= kmer_len:
            ktable.consume(sequence)

    finished = time.time()
    print("Loading up of seqs into ktable took %.2f seconds."
          % (finished - started))
    return ktable
def load_fastx_into_ktable(fastx_fname, kmer_len):
    """Consume every sufficiently long sequence of a file into a ktable.

    Entries whose sequence is shorter than ``kmer_len`` are ignored.  A
    timing message is printed, then the filled ktable is returned.
    """
    t_start = time.time()
    table = khmer.new_ktable(kmer_len)

    for entry in fastx_utils.get_fastx_entries(fastx_fname):
        _entry_name, entry_seq = entry
        too_short = len(entry_seq) < kmer_len
        if not too_short:
            table.consume(entry_seq)

    t_end = time.time()
    message = "Loading up of seqs into ktable took %.2f seconds." % (
        t_end - t_start)
    print(message)
    return table
def test_update(self):
    """Merge a second ktable into ``self.kt`` with ``update``.

    After the merge, every non-zero slot must be one that was set in at
    least one of the two source tables.
    """
    kt = self.kt

    # update: first table gets every multiple of 4 ...
    # (floor division keeps the range() bound an integer)
    for i in range(0, 4**L // 4):
        kt.set(i * 4, 1)

    # ... and the second table gets every multiple of 5.
    kt2 = khmer.new_ktable(L)
    for i in range(0, 4**L // 5):
        kt2.set(i * 5, 1)

    kt.update(kt2)

    # Anything present after the merge came from one table or the other.
    for i in range(0, 4**L):
        if kt.get(i):
            assert i % 4 == 0 or i % 5 == 0
def test_update(self):
    """Merge a second ktable into ``self.kt`` with ``update``.

    After the merge, every non-zero slot must be one that was set in at
    least one of the two source tables.
    """
    kt = self.kt
    n_slots = 4 ** L

    # update: first table gets every multiple of 4 ...
    # (floor division keeps the range() bound an integer)
    for i in range(0, n_slots // 4):
        kt.set(i * 4, 1)

    # ... and the second table gets every multiple of 5.
    kt2 = khmer.new_ktable(L)
    for i in range(0, n_slots // 5):
        kt2.set(i * 5, 1)

    kt.update(kt2)

    # Anything present after the merge came from one table or the other.
    for i in range(0, n_slots):
        if kt.get(i):
            assert i % 4 == 0 or i % 5 == 0
def calc_tetra(seqs):
    """Print the tetranucleotide (4-mer) counts of a set of sequences.

    Args:
        seqs: iterable of records exposing the sequence as ``.seq``;
            each is converted with ``str()`` before being consumed.

    The result, a dict mapping each of the 256 A/C/G/T 4-mers to its
    integer count, is printed; nothing is returned.
    """
    import itertools

    ktable = khmer.new_ktable(4)
    for seq in seqs:
        ktable.consume(str(seq.seq))

    # Enumerate the 4-mers in the same order as four nested A/C/G/T loops
    # and look each count up directly.
    tetramers = {}
    for letters in itertools.product('ACGT', repeat=4):
        word = ''.join(letters)
        tetramers[word] = int(ktable.get(word))

    print(tetramers)
def main(inp_name, outp_name, min_seq_len):
    """Write one k-mer count vector per sufficiently long input record.

    Args:
        inp_name: sequence file opened with ``screed.open``.
        outp_name: path of the text file to write.
        min_seq_len: minimum sequence length (anything ``int()`` accepts);
            shorter records are skipped.

    Each output line holds the record name (up to its first tab) followed
    by the space-separated counts of all ``4**KSIZE`` k-mers.
    """
    # Validate the argument before touching the output file, so a bad
    # value does not leave a truncated file behind.
    min_seq_len = int(min_seq_len)
    n_kmers = 4**KSIZE

    # The context manager guarantees the output is flushed and closed.
    with open(outp_name, 'w') as outfp:
        for record in screed.open(inp_name):
            if len(record.sequence) < min_seq_len:
                continue

            # A fresh table per record keeps the counts independent.
            kt = khmer.new_ktable(KSIZE)
            kt.consume(record.sequence)

            vector = " ".join(str(kt.get(i)) for i in range(n_kmers))
            outfp.write("%s %s\n" % (record.name.split('\t')[0], vector))
def main(inp_name, outp_name, min_seq_len):
    """Write one k-mer count vector per sufficiently long input record.

    Args:
        inp_name: sequence file opened with ``screed.open``.
        outp_name: path of the text file to write.
        min_seq_len: minimum sequence length (anything ``int()`` accepts);
            shorter records are skipped.

    Each output line holds the record name (up to its first tab) followed
    by the space-separated counts of all ``4**KSIZE`` k-mers.
    """
    # Validate the argument before touching the output file, so a bad
    # value does not leave a truncated file behind.
    min_seq_len = int(min_seq_len)
    n_kmers = 4**KSIZE

    # The context manager guarantees the output is flushed and closed.
    with open(outp_name, 'w') as outfp:
        for record in screed.open(inp_name):
            if len(record.sequence) < min_seq_len:
                continue

            # A fresh table per record keeps the counts independent.
            kt = khmer.new_ktable(KSIZE)
            kt.consume(record.sequence)

            counts = [str(kt.get(i)) for i in range(n_kmers)]
            label = record.name.split('\t')[0]
            outfp.write("%s %s\n" % (label, " ".join(counts)))
def test_intersection(self):
    """Intersect two ktables and check which slots survive.

    ``self.kt`` has every multiple of 4 set and a second table every
    multiple of 5; each non-zero slot of the intersection must be a
    multiple of both.
    """
    kt = self.kt
    n_slots = 4 ** L

    # intersection
    # (floor division keeps the range() bound an integer)
    for i in range(0, n_slots // 4):
        kt.set(i * 4, 1)

    kt2 = khmer.new_ktable(L)
    for i in range(0, n_slots // 5):
        kt2.set(i * 5, 1)

    kt3 = kt.intersect(kt2)

    # Slot 20 is set in both inputs; the test pins its combined count.
    assert kt3.get(20) == 2
    for i in range(0, n_slots):
        if kt3.get(i):
            assert i % 4 == 0
            assert i % 5 == 0
def test_intersection(self):
    """Intersect two ktables and check which slots survive.

    ``self.kt`` has every multiple of 4 set and a second table every
    multiple of 5; each non-zero slot of the intersection must be a
    multiple of both.
    """
    kt = self.kt

    # intersection
    # (floor division keeps the range() bound an integer)
    for i in range(0, 4**L // 4):
        kt.set(i * 4, 1)

    kt2 = khmer.new_ktable(L)
    for i in range(0, 4**L // 5):
        kt2.set(i * 5, 1)

    kt3 = kt.intersect(kt2)

    # Slot 20 is set in both inputs; the test pins its combined count.
    assert kt3.get(20) == 2
    for i in range(0, 4**L):
        if kt3.get(i):
            assert i % 4 == 0
            assert i % 5 == 0
def test_complete_4_collision():
    """Count every 4-mer into a hashtable a quarter of the needed size.

    With only ``4**4 // 4`` slots for ``4**4`` k-mers, collisions are
    unavoidable; the test checks that the first 64 k-mers are all still
    reported as present when looked up by string.
    """
    # Floor division keeps the table size an integer.
    kh = khmer.new_hashtable(4, 4**4 // 4)
    kt = khmer.new_ktable(4)

    for i in range(0, kt.n_entries()):
        kh.count(kt.reverse_hash(i))

    n_checked = 64
    n_rc_filled = 0
    n_fwd_filled = 0  # tallied for reference; no assertion is made on it
    for i in range(0, n_checked):
        s = kt.reverse_hash(i)
        if kh.get(s):  # string hashing is rc aware
            n_rc_filled += 1
        if kh.get(i):  # int hashing is not rc aware
            n_fwd_filled += 1

    assert n_rc_filled == n_checked, n_rc_filled
def test_collision_2(self):
    """Check the count of a 12-mer after consuming a colliding 12-mer.

    ``GG`` and ``collision_2`` hash to the same slot modulo 1009837 (see
    the arithmetic notes below); after both are consumed into
    ``self.hi``, ``GG`` must still be reported with a count of 1.
    """
    # Plain integer literals compare equal to longs, so no 'L' suffix is
    # needed; no ktable is created because only khmer.forward_hash is used.
    GG = 'G' * 12  # forward_hash: 11184810
    assert khmer.forward_hash(GG, 12) == 11184810

    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777

    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603

    # hash(GG) % 1000003 == hash(collision_1)
    # hash(GG) % 1009837 == hash(collision_2)

    hi = self.hi
    hi.consume(GG)
    hi.consume(collision_2)

    assert hi.get(GG) == 1
def makeKmerArray(screedb,ksize,normalize): """ This takes a screedb file and a k-mer size in inputs, and print the ktable. """ ktable = khmer.new_ktable(ksize) knames=[] print str(ksize) + "-mer" + sep, for i in range(0, ktable.n_entries()): knames.append(ktable.reverse_hash(i)) print sep.join(knames) if norm: for record in screedb.itervalues(): tot=float() kmers=[] ktable.clear() ktable.consume(str(record.sequence)) print record.name, for i in range(0, ktable.n_entries()): kmers.append(ktable.get(i)) tot=tot+ktable.get(i) #print kmers kmersNorm = [float(x)/tot for x in kmers] #print kmersNorm for ele in kmersNorm: sys.stdout.write(sep + '%f' % (ele)) print else: for record in screedb.itervalues(): ktable.clear() ktable.consume(str(record.sequence)) sys.stdout.write(record.name) for i in range(0, ktable.n_entries()): sys.stdout.write(sep + str(ktable.get(i))) print
def makeKmerArray(screedb, ksize, normalize): """ This takes a screedb file and a k-mer size in inputs, and print the ktable. """ ktable = khmer.new_ktable(ksize) knames = [] print str(ksize) + "-mer" + sep, for i in range(0, ktable.n_entries()): knames.append(ktable.reverse_hash(i)) print sep.join(knames) if norm: for record in screedb.itervalues(): tot = float() kmers = [] ktable.clear() ktable.consume(str(record.sequence)) print record.name, for i in range(0, ktable.n_entries()): kmers.append(ktable.get(i)) tot = tot + ktable.get(i) #print kmers kmersNorm = [float(x) / tot for x in kmers] #print kmersNorm for ele in kmersNorm: sys.stdout.write(sep + '%f' % (ele)) print else: for record in screedb.itervalues(): ktable.clear() ktable.consume(str(record.sequence)) sys.stdout.write(record.name) for i in range(0, ktable.n_entries()): sys.stdout.write(sep + str(ktable.get(i))) print
K = 12          # size of K
N = 25000       # 1/4 the size of the genome
P_ERROR = .01   # per-base probability of error

###

# construct a random genome: N copies of each base, shuffled
genome = list("A" * N + "C" * N + "G" * N + "T" * N)
random.shuffle(genome)
genome = "".join(genome)

###

# count the number of unique k-mers
kt = khmer.new_ktable(K)
kt.consume(genome)

total = 0
for i in range(0, 4**K):
    if not kt.get(i):
        continue
    total += 1

sys.stderr.write("%d unique k-mers in genome\n" % total)

###

# go through, sample with replacement and mutation, and calculate
# number of novel k-mers picked as a function of sampling.

kt = khmer.new_ktable(K)
def setup(self):
    """Store a newly created ktable of word size ``L`` on ``self.kt``."""
    # make a new ktable.
    fresh_table = khmer.new_ktable(L)
    self.kt = fresh_table
return round(estimate) def _bitscan(self, x, m): v = 1 while v<=m and not x&0x80000000: v+=1 x<<=1 return v if __name__ == '__main__': k=12 kt=khmer.new_ktable(k) alphabet={0:'A',1:'T',2:'G',3:'C'} given_string='' for i in range(1000): given_string+=alphabet[random.randint(0,3)] n=kt.consume(given_string) H = HyperLogLog(8) for i in range(n): H.add(given_string[i:i+k],k)
from threading import Thread
import time

import khmer


class ConsumeThread(Thread):
    """Thread that counts the k-mers of one slice of a genome.

    After ``run()`` finishes, the resulting ktable is available as
    ``self.kt``.
    """

    def __init__(self, wordsize, genome):
        Thread.__init__(self)
        self.wordsize = wordsize
        self.genome = genome

    def run(self):
        self.kt = khmer.consume_genome(self.wordsize, self.genome)


WORDSIZE = 5

# Read the whole genome, making sure the file is closed afterwards.
genome_file = open('/tmp/all2.dna')
try:
    genome = genome_file.read()
finally:
    genome_file.close()

nthreads = 2

length = len(genome)
half = length // 2  # floor division: slice indices must be integers

# Let the first slice run WORDSIZE - 1 characters past the midpoint so the
# k-mers that straddle the split are counted (exactly once, by thread 1)
# instead of being lost.
genome1 = genome[:half + WORDSIZE - 1]
genome2 = genome[half:]

t1 = ConsumeThread(WORDSIZE, genome1)
t2 = ConsumeThread(WORDSIZE, genome2)

t1.start()
t2.start()

t1.join()
t2.join()

# Merge the per-thread tables into a single table.
master_kt = khmer.new_ktable(WORDSIZE)
master_kt.update(t1.kt)
master_kt.update(t2.kt)
# Fill a counting hash and an exact ktable from the same FASTA file, then
# measure how often the two disagree on the k-mers first seen by the hash.
ht = khmer.new_counting_hash(K, HT_SIZE, N_HT)
ktable = khmer.new_ktable(K)

# k-mers whose counting-hash count was zero when they were encountered.
unique_kmer = []

# One pass over the file feeds both structures; the file is closed on exit.
with open(filename) as fasta_file:
    for record in fasta_iter(fasta_file):
        sequence = record['sequence']
        seq_len = len(sequence)
        for start in range(0, seq_len + 1 - K):
            kmer = sequence[start:start + K]
            if not ht.get(kmer):
                unique_kmer.append(kmer)
            ht.count(kmer)
            ktable.count(kmer)

# for kmer2 in unique_kmer:
#     print kmer2,ht.get(kmer2)

false = 0
# NOTE(review): the denominator starts at 1, so it ends up one larger than
# len(unique_kmer) -- presumably to avoid dividing by zero; confirm.
all_n = 1
for kmer2 in unique_kmer:
    if ht.get(kmer2) != ktable.get(kmer2):
        false = false + 1
    all_n = all_n + 1

# Percentage of the recorded k-mers whose two counts differ.
fp = false * 100.0 / all_n
import sys
sys.path.insert(0, 'build/lib.linux-i686-2.3/')

import khmer

# Count the 6-mers of a short example sequence.
ktable = khmer.new_ktable(6)
ktable.consume("ATGAGAGACACAGGGAGAGACCCAATTAGAGAATTGGACC")

# Report every 6-mer that occurs at least once.
for i in range(0, ktable.n_entries()):
    n = ktable.get(i)
    if not n:
        continue
    print("%s is present %s time(s)." % (ktable.reverse_hash(i), n))
import khmer


class ConsumeThread(Thread):
    """Thread that counts the k-mers of one slice of a genome.

    After ``run()`` finishes, the resulting ktable is available as
    ``self.kt``.
    """

    def __init__(self, wordsize, genome):
        Thread.__init__(self)
        self.wordsize = wordsize
        self.genome = genome

    def run(self):
        self.kt = khmer.consume_genome(self.wordsize, self.genome)


WORDSIZE = 5

# Read the whole genome, making sure the file is closed afterwards.
genome_file = open('/tmp/all2.dna')
try:
    genome = genome_file.read()
finally:
    genome_file.close()

nthreads = 2

length = len(genome)
half = length // 2  # floor division: slice indices must be integers

# Let the first slice run WORDSIZE - 1 characters past the midpoint so the
# k-mers that straddle the split are counted (exactly once, by thread 1)
# instead of being lost.
genome1 = genome[:half + WORDSIZE - 1]
genome2 = genome[half:]

t1 = ConsumeThread(WORDSIZE, genome1)
t2 = ConsumeThread(WORDSIZE, genome2)

t1.start()
t2.start()

t1.join()
t2.join()

# Merge the per-thread tables into a single table.
master_kt = khmer.new_ktable(WORDSIZE)
master_kt.update(t1.kt)
master_kt.update(t2.kt)