Beispiel #1
0
def test_3_tables():
    """Check GG's count while other k-mers collide with it table by table.

    The countgraph is built from the sizes in ``PRIMES_1m`` plus one extra
    table of size 1000005.  The modulo notes below assume ``PRIMES_1m`` holds
    1000003 and 1009837 -- it is defined outside this test, so confirm there.
    """
    table_sizes = list(PRIMES_1m) + [1000005]
    hi = khmer._Countgraph(12, table_sizes)

    GG = 'G' * 12
    assert khmer.forward_hash(GG, 12) == 11184810

    # (k-mer, its forward hash, expected count of GG after consuming it)
    #   11184810 % 1000003 == 184777
    #   11184810 % 1009837 == 76603
    #   11184810 % 1000005 == 184755
    colliders = [
        ('AAACGTATGACT', 184777, 1),
        ('AAATACCGAGCG', 76603, 1),
        ('AAACGTATCGAG', 184755, 2),
    ]
    for kmer, kmer_hash, _ in colliders:
        assert khmer.forward_hash(kmer, 12) == kmer_hash

    hi.consume(GG)
    assert hi.get(GG) == 1

    # GG's count is expected to rise only once the third collider is in.
    for kmer, _, gg_count in colliders:
        hi.consume(kmer)
        assert hi.get(GG) == gg_count
def test_3_tables():
    """Check GG's count while other k-mers collide with it table by table.

    The counting hash is built from the sizes in ``PRIMES_1m`` plus one extra
    table of size 1000005.  The modulo notes below assume ``PRIMES_1m`` holds
    1000003 and 1009837 -- it is defined outside this test, so confirm there.
    """
    x = list(PRIMES_1m)
    x.append(1000005)

    hi = khmer._new_counting_hash(12, x)

    GG = 'G' * 12                   # forward_hash: 11184810
    assert khmer.forward_hash(GG, 12) == 11184810

    # Expected hashes are plain ints: the Python 2 ``L`` suffix is a syntax
    # error on Python 3, and int/long compare equal on Python 2 anyway.
    collision_1 = 'AAACGTATGACT'
    assert khmer.forward_hash(collision_1, 12) == 184777

    collision_2 = 'AAATACCGAGCG'
    assert khmer.forward_hash(collision_2, 12) == 76603

    collision_3 = 'AAACGTATCGAG'
    assert khmer.forward_hash(collision_3, 12) == 184755

    # hash(GG) % 1000003 == hash(collision_1)
    # hash(GG) % 1009837 == hash(collision_2)
    # hash(GG) % 1000005 == hash(collision_3)
    hi.consume(GG)
    assert hi.get(GG) == 1

    hi.consume(collision_1)
    assert hi.get(GG) == 1

    hi.consume(collision_2)
    assert hi.get(GG) == 1

    # Only after the third collider is consumed is GG expected to read 2.
    hi.consume(collision_3)
    assert hi.get(GG) == 2
Beispiel #3
0
def test_forward_hash():
    """Pin forward_hash results for four 4-mers and one 32-mer."""
    # The asserted values pair up: AAAA/TTTT -> 0 and CCCC/GGGG -> 170.
    four_mers = (('AAAA', 0), ('TTTT', 0), ('CCCC', 170), ('GGGG', 170))
    for kmer, expected in four_mers:
        assert khmer.forward_hash(kmer, 4) == expected

    # A 32-mer whose expected hash is larger than 2**63.
    long_kmer = 'GGTTGACGGGGCTCAGGGGGCGGCTGACTCCG'
    assert khmer.forward_hash(long_kmer, 32) == 13607885392109549066
Beispiel #4
0
def test_forward_hash():
    """Verify known forward_hash values at K=4 and K=32."""
    kmers = ('AAAA', 'TTTT', 'CCCC', 'GGGG')
    hashes = (0, 0, 170, 170)
    for kmer, expected in zip(kmers, hashes):
        assert khmer.forward_hash(kmer, 4) == expected

    # Expected value for the 32-mer exceeds 2**63.
    expected_32 = 13607885392109549066
    observed_32 = khmer.forward_hash('GGTTGACGGGGCTCAGGGGGCGGCTGACTCCG', 32)
    assert observed_32 == expected_32
Beispiel #5
0
def get_neighbors(kmer_hash, K):
    """Return the set of hashes of every k-mer one base-shift from this one.

    The k-mer string is recovered with ``khmer.reverse_hash``.  For each
    entry of the module-level ``bases``, one candidate is made by prepending
    it to the k-mer minus its last character, and one by appending it to the
    k-mer minus its first character.  Duplicate hashes collapse in the set.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    head = kmer[:-1]   # everything but the last base
    tail = kmer[1:]    # everything but the first base

    return {
        khmer.forward_hash(candidate, K)
        for base in bases
        for candidate in (base + head, tail + base)
    }
Beispiel #6
0
def get_neighbors(kmer_hash, K):
    """Hash every k-mer that overlaps the given one by K-1 characters.

    Uses the module-level ``bases`` as the alphabet.  Returns a set, so a
    hash produced by more than one candidate appears only once.
    """
    kmer = khmer.reverse_hash(kmer_hash, K)
    prefix = kmer[:-1]
    suffix = kmer[1:]

    found = set()
    for base in bases:
        # Shift right (new first base), then shift left (new last base).
        found.add(khmer.forward_hash(base + prefix, K))
        found.add(khmer.forward_hash(suffix + base, K))
    return found
Beispiel #7
0
    def test_failed_get(self):
        """``get`` must raise ValueError when handed a float key."""
        GG = 'G' * 12                   # forward_hash: 11184810
        GGhash = khmer.forward_hash(GG, 12)
        assert GGhash == 11184810

        hi = self.hi
        hi.consume(GG)

        try:
            hi.get(float(GGhash))
        except ValueError as err:
            print(str(err))
        else:
            # The previous `assert "<non-empty string>"` was always true, so
            # a missing exception went unnoticed.  Fail explicitly instead.
            raise AssertionError("the previous statement should fail")
Beispiel #8
0
    def test_failed_get(self):
        """Looking up a float key is expected to raise ValueError."""
        GG = 'G' * 12  # forward_hash: 11184810
        GGhash = khmer.forward_hash(GG, 12)
        assert GGhash == 11184810

        hi = self.hi
        hi.consume(GG)

        try:
            hi.get(float(GGhash))
        except ValueError as err:
            print(str(err))
        else:
            # `assert "some text"` can never fail (a non-empty string is
            # truthy); raise so the test really fails when get() succeeds.
            raise AssertionError("the previous statement should fail")
Beispiel #9
0
def explore(ht, start_kmer, K):
    """Traverse the k-mer graph reachable from ``start_kmer``.

    Args:
        ht: table queried as ``ht.get(kmer_string)``; a truthy result means
            the k-mer is present.
        start_kmer: k-mer string to start from.
        K: k-mer length handed to the khmer hash helpers.

    Returns:
        ``(hash_ids, edges)``.  ``hash_ids`` maps each discovered k-mer hash
        to a 1-based id in discovery order.  ``edges`` is a set of sorted
        id pairs, one per present neighbour seen from an explored node.
        Both are empty when the start k-mer is absent from ``ht``.
    """
    edges = set()
    discovered = set()
    explored = set()
    hash_ids = {}

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return hash_ids, edges

    discovered.add(start_kmer_hash)
    hash_ids[start_kmer_hash] = len(hash_ids) + 1

    # The traversal stops once 2,000,000 nodes have been explored.
    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        explored.add(kmer_hash)

        for neigh_hash in get_neighbors(kmer_hash, K):
            # One table lookup per neighbour (previously repeated in the
            # if/elif pair); absent neighbours contribute nothing.
            if not ht.get(khmer.reverse_hash(neigh_hash, K)):
                continue

            if neigh_hash not in explored and neigh_hash not in discovered:
                discovered.add(neigh_hash)
                hash_ids[neigh_hash] = len(hash_ids) + 1

            # Every present neighbour yields an edge, new or already seen.
            edges.add(
                tuple(sorted([hash_ids[neigh_hash], hash_ids[kmer_hash]])))

    return hash_ids, edges
Beispiel #10
0
def explore(ht, start_kmer, K):
    """Traverse the k-mer graph reachable from ``start_kmer``.

    Args:
        ht: table queried as ``ht.get(kmer_string)``; a truthy result means
            the k-mer is present.
        start_kmer: k-mer string to start from.
        K: k-mer length handed to the khmer hash helpers.

    Returns:
        ``(hash_ids, edges)``.  ``hash_ids`` maps each discovered k-mer hash
        to a 1-based id in discovery order.  ``edges`` is a set of sorted
        id pairs, one per present neighbour seen from an explored node.
        Both are empty when the start k-mer is absent from ``ht``.
    """
    edges = set()
    discovered = set()
    explored = set()
    hash_ids = {}

    start_kmer_hash = khmer.forward_hash(start_kmer, K)
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return hash_ids, edges

    discovered.add(start_kmer_hash)
    hash_ids[start_kmer_hash] = len(hash_ids) + 1

    # The traversal stops once 2,000,000 nodes have been explored.
    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        explored.add(kmer_hash)

        for neigh_hash in get_neighbors(kmer_hash, K):
            # Look the neighbour up once instead of once per branch.
            if not ht.get(khmer.reverse_hash(neigh_hash, K)):
                continue

            if neigh_hash not in explored and neigh_hash not in discovered:
                discovered.add(neigh_hash)
                hash_ids[neigh_hash] = len(hash_ids) + 1

            # Both the "new" and the "already seen" case record the edge.
            edges.add(
                tuple(sorted([hash_ids[neigh_hash], hash_ids[kmer_hash]])))

    return hash_ids, edges
    def test_collision_2(self):
        """GG's count stays 1 after consuming ``collision_2``."""

        GG = 'G' * 12                   # forward_hash: 11184810
        assert khmer.forward_hash(GG, 12) == 11184810

        # Plain int literals: the Python 2 ``L`` suffix does not parse on
        # Python 3, and int/long compare equal on Python 2.
        collision_1 = 'AAACGTATGACT'
        assert khmer.forward_hash(collision_1, 12) == 184777

        collision_2 = 'AAATACCGAGCG'
        assert khmer.forward_hash(collision_2, 12) == 76603

        # hash(GG) % 1000003 == hash(collision_1)
        # hash(GG) % 1009837 == hash(collision_2)

        hi = self.hi
        hi.consume(GG)
        hi.consume(collision_2)

        assert hi.get(GG) == 1
Beispiel #12
0
    def test_collision_2(self):
        """Consuming ``collision_2`` must leave GG's count at 1."""
        GG = 'G' * 12
        collision_1 = 'AAACGTATGACT'
        collision_2 = 'AAATACCGAGCG'

        # Pin the hashes the collision relies on:
        #   11184810 % 1000003 == 184777  (collision_1)
        #   11184810 % 1009837 == 76603   (collision_2)
        expected_hashes = (
            (GG, 11184810),
            (collision_1, 184777),
            (collision_2, 76603),
        )
        for kmer, value in expected_hashes:
            assert khmer.forward_hash(kmer, 12) == value

        table = self.hi
        for kmer in (GG, collision_2):
            table.consume(kmer)

        assert table.get(GG) == 1
Beispiel #13
0
    def test_collision_1(self):
        """GG's count stays 1 after consuming ``collision_1``."""
        # The unused ``kt = khmer.new_ktable(12)`` local was dropped; the
        # test only ever exercises ``self.hi``.
        GG = 'G' * 12                   # forward_hash: 11184810
        assert khmer.forward_hash(GG, 12) == 11184810

        # Plain int literals: the Python 2 ``L`` suffix does not parse on
        # Python 3, and int/long compare equal on Python 2.
        collision_1 = 'AAACGTATGACT'
        assert khmer.forward_hash(collision_1, 12) == 184777

        collision_2 = 'AAATACCGAGCG'
        assert khmer.forward_hash(collision_2, 12) == 76603

        # note, hash(GG) % 1000003 == hash(collision_1)
        # note, hash(GG) % 1009837 == hash(collision_2)

        hi = self.hi
        hi.consume(GG)
        hi.consume(collision_1)

        assert hi.get(GG) == 1
Beispiel #14
0
    def test_n_occupied_args(self):
        """Exercise ``n_occupied`` with explicit (start, stop) arguments.

        NOTE(review): the assertions are consistent with a half-open
        [start, stop) range over hash values -- confirm against khmer.
        """
        kh = self.kh
        upper = 4**4

        assert kh.n_occupied() == 0

        # 'AAAA' is expected to land in [0, 1) and nowhere above it.
        kh.consume('AAAA')
        assert kh.n_occupied(0, 1) == 1
        assert kh.n_occupied(1, upper) == 0

        hashvalue = khmer.forward_hash('AACT', 4)
        kh.consume('AACT')
        after = hashvalue + 1
        assert kh.n_occupied(0, after) == 2
        assert kh.n_occupied(after, upper) == 0
        assert kh.n_occupied(hashvalue, after) == 1
    def test_n_occupied_args(self):
        """Check ``n_occupied`` counts when a (start, stop) pair is given.

        NOTE(review): the expected values fit a half-open [start, stop)
        range over hash values -- confirm against the khmer API.
        """
        table = self.kh
        n_possible = 4 ** 4

        assert table.n_occupied() == 0

        table.consume('AAAA')
        for bounds, expected in (((0, 1), 1), ((1, n_possible), 0)):
            assert table.n_occupied(*bounds) == expected

        hashvalue = khmer.forward_hash('AACT', 4)
        table.consume('AACT')
        checks = (
            ((0, hashvalue + 1), 2),
            ((hashvalue + 1, n_possible), 0),
            ((hashvalue, hashvalue + 1), 1),
        )
        for bounds, expected in checks:
            assert table.n_occupied(*bounds) == expected
Beispiel #16
0
def test_kmer_neighbors():
    """``neighbors`` must agree whether given a k-mer string or its hash.

    The graph is loaded from the ``all-A.fa`` test file; every expected
    neighbour hash is 0, the value the original comments attribute to AAAA.
    """
    inpfile = utils.get_test_data('all-A.fa')
    nodegraph = khmer._Nodegraph(4, [3, 5])
    nodegraph.consume_fasta(inpfile)

    h = khmer.forward_hash('AAAA', 4)
    print(type('AAAA'))
    # AAAA on both sides
    assert nodegraph.neighbors(h) == [0, 0]
    assert nodegraph.neighbors('AAAA') == [0, 0]

    remaining = (
        ('AAAT', [0]),   # AAAA on one side
        ('AATA', []),    # no neighbors
        ('TAAA', [0]),   # AAAA on one side
    )
    for kmer, expected in remaining:
        h = khmer.forward_hash(kmer, 4)
        assert nodegraph.neighbors(h) == expected
        assert nodegraph.neighbors(kmer) == expected
Beispiel #17
0
def test_kmer_neighbors():
    """Compare ``neighbors`` results for hash and string forms of a k-mer.

    Uses the ``all-A.fa`` test file; the expected lists contain only 0,
    which the original comments identify as AAAA.
    """
    nodegraph = khmer._Nodegraph(4, [3, 5])
    nodegraph.consume_fasta(utils.get_test_data('all-A.fa'))

    def check(kmer, h, expected):
        # Same expectation for the hashed and the string query.
        assert nodegraph.neighbors(h) == expected
        assert nodegraph.neighbors(kmer) == expected

    h = khmer.forward_hash('AAAA', 4)
    print(type('AAAA'))
    check('AAAA', h, [0, 0])  # AAAA on both sides

    h = khmer.forward_hash('AAAT', 4)
    check('AAAT', h, [0])  # AAAA on one side

    h = khmer.forward_hash('AATA', 4)
    check('AATA', h, [])  # no neighbors

    h = khmer.forward_hash('TAAA', 4)
    check('TAAA', h, [0])  # AAAA on one side
Beispiel #18
0
def get_all_kmers(ht, start_kmer, K, ht2, degs):
    """Visit every k-mer reachable from ``start_kmer``, recording degrees.

    Args:
        ht: table handed to ``find_neighbors`` to list a k-mer's neighbours.
        start_kmer: k-mer string to start from.
        K: k-mer length used by the khmer hash helpers.
        ht2: "already visited" table; queried with ``get`` and updated with
            ``count``.
        degs: degree accumulator, threaded through ``add_deg``.

    Returns:
        ``(ht2, degs)``.  Returned unchanged when the start k-mer is already
        marked in ``ht2``.  The normal path previously fell off the end and
        returned None; it now returns the same tuple as the early exit.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    if ht2.get(start_kmer_hash):
        return ht2, degs
    ht2.count(start_kmer)

    q = []

    # The start k-mer is expanded from its original string, not from a
    # reverse_hash round trip as the queued k-mers are.
    neighs = find_neighbors(start_kmer, ht)
    degs = add_deg(degs, len(neighs))

    for neigh in neighs:
        if not ht2.get(neigh):
            q.append(khmer.forward_hash(neigh, K))
            ht2.count(neigh)

    # q.pop() takes from the end of the list, so this is last-in-first-out.
    while q:
        kmer = khmer.reverse_hash(q.pop(), K)
        neighs = find_neighbors(kmer, ht)
        degs = add_deg(degs, len(neighs))

        for neigh in neighs:
            if not ht2.get(neigh):
                q.append(khmer.forward_hash(neigh, K))
                ht2.count(neigh)

    return ht2, degs
Beispiel #19
0
def get_all_kmers(ht, start_kmer, K, ht2, degs):
    """Visit every k-mer reachable from ``start_kmer``, recording degrees.

    Args:
        ht: table handed to ``find_neighbors`` to list a k-mer's neighbours.
        start_kmer: k-mer string to start from.
        K: k-mer length used by the khmer hash helpers.
        ht2: "already visited" table; queried with ``get`` and updated with
            ``count``.
        degs: degree accumulator, threaded through ``add_deg``.

    Returns:
        ``(ht2, degs)`` on every path.  Previously only the early exit
        (start k-mer already marked in ``ht2``) returned the tuple and the
        full traversal implicitly returned None.
    """
    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    if ht2.get(start_kmer_hash):
        return ht2, degs
    ht2.count(start_kmer)

    q = []

    # Seed the work list from the start k-mer's own string form.
    neighs = find_neighbors(start_kmer, ht)
    degs = add_deg(degs, len(neighs))

    for neigh in neighs:
        if not ht2.get(neigh):
            q.append(khmer.forward_hash(neigh, K))
            ht2.count(neigh)

    # Queued entries are hashes; each is turned back into a string before
    # its neighbours are looked up.
    while q:
        kmer = khmer.reverse_hash(q.pop(), K)
        neighs = find_neighbors(kmer, ht)
        degs = add_deg(degs, len(neighs))

        for neigh in neighs:
            if not ht2.get(neigh):
                q.append(khmer.forward_hash(neigh, K))
                ht2.count(neigh)

    return ht2, degs
Beispiel #20
0
def gen_graph(filename, edges, hash_ids, chr, K):
    """Write an undirected Graphviz graph of ``edges`` to ``filename``.

    Args:
        filename: path of the file to create or overwrite.
        edges: iterable of 2-item sequences of node ids; each is written as
            ``N<a> -- N<b>;``.
        hash_ids: mapping from k-mer hash to node id.
        chr: sequence whose k-mers get a black node style instead of the
            default red one.
        K: k-mer length.

    Raises:
        KeyError: if a k-mer of ``chr`` has no entry in ``hash_ids``.
    """
    # ``with`` closes the file even when the hash_ids lookup below raises.
    with open(filename, "w") as fd:
        fd.write("graph x {\nsize=\"16, 16\";\n")
        fd.write("node [ color = red, fontcolor = black, style = filled ];\n")

        # NOTE(review): range(len(chr) - K) stops one window short of the
        # last k-mer (there are len(chr) - K + 1 windows).  Kept as-is to
        # preserve the output -- confirm whether that is intended.
        for i in range(len(chr) - K):
            kmer_hash = khmer.forward_hash(chr[i:i + K], K)
            fd.write("N{} [color = black, fontcolor = white];\n".format(
                hash_ids[kmer_hash]))

        for edge in edges:
            fd.write("N{} -- N{};\n".format(edge[0], edge[1]))

        fd.write("}")
Beispiel #21
0
def gen_graph(filename, edges, hash_ids, chr, K):
    """Write an undirected Graphviz graph of ``edges`` to ``filename``.

    Args:
        filename: path of the file to create or overwrite.
        edges: iterable of 2-item sequences of node ids; each is written as
            ``N<a> -- N<b>;``.
        hash_ids: mapping from k-mer hash to node id.
        chr: sequence whose k-mers get a black node style instead of the
            default red one.
        K: k-mer length.

    Raises:
        KeyError: if a k-mer of ``chr`` has no entry in ``hash_ids``.
    """
    # The context manager guarantees the handle is closed on any exception.
    with open(filename, "w") as fd:
        fd.write("graph x {\nsize=\"16, 16\";\n")
        fd.write("node [ color = red, fontcolor = black, style = filled ];\n")

        # NOTE(review): this bound skips the final k-mer of ``chr`` (a
        # sequence has len(chr) - K + 1 windows).  Left unchanged so the
        # output stays the same -- confirm whether that is intended.
        for i in range(len(chr) - K):
            kmer_hash = khmer.forward_hash(chr[i:i + K], K)
            fd.write("N{} [color = black, fontcolor = white];\n".format(
                hash_ids[kmer_hash]))

        for edge in edges:
            fd.write("N{} -- N{};\n".format(edge[0], edge[1]))

        fd.write("}")
def explore(ht, start_kmer, K):
    """Count the k-mers reachable from ``start_kmer`` that are present in ``ht``.

    Args:
        ht: table queried as ``ht.get(kmer_string)``; a truthy result means
            the k-mer is present.
        start_kmer: k-mer string to start from.
        K: k-mer length handed to the khmer hash helpers.

    Returns:
        The number of explored k-mers, or 0 when the start k-mer is absent.
        The traversal stops once 2,000,000 k-mers have been explored.
    """
    discovered = set()
    explored = set()

    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    # The original tested ``ht.get(kmer)``, but no ``kmer`` exists in this
    # function.  Query the start k-mer the same way neighbours are queried.
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return 0
    discovered.add(start_kmer_hash)

    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)

        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            if (ht.get(khmer.reverse_hash(neigh_hash, K))
                    and neigh_hash not in explored
                    and neigh_hash not in discovered):
                discovered.add(neigh_hash)

    return len(explored)
Beispiel #23
0
def explore(ht, start_kmer, K):
    """Count the k-mers reachable from ``start_kmer`` that are present in ``ht``.

    Args:
        ht: table queried as ``ht.get(kmer_string)``; a truthy result means
            the k-mer is present.
        start_kmer: k-mer string to start from.
        K: k-mer length handed to the khmer hash helpers.

    Returns:
        The number of explored k-mers, or 0 when the start k-mer is absent.
        The traversal stops once 2,000,000 k-mers have been explored.
    """
    discovered = set()
    explored = set()

    start_kmer_hash = khmer.forward_hash(start_kmer, K)

    # ``kmer`` was never defined here, so the original presence check raised
    # NameError (or silently read an unrelated global).  Check the start
    # k-mer with the same hash round trip used for neighbours below.
    if not ht.get(khmer.reverse_hash(start_kmer_hash, K)):
        return 0
    discovered.add(start_kmer_hash)

    while discovered and len(explored) < 2000000:
        kmer_hash = discovered.pop()
        kmer_neighbors = get_neighbors(kmer_hash, K)

        explored.add(kmer_hash)

        for neigh_hash in kmer_neighbors:
            if (ht.get(khmer.reverse_hash(neigh_hash, K))
                    and neigh_hash not in explored
                    and neigh_hash not in discovered):
                discovered.add(neigh_hash)

    return len(explored)
Beispiel #24
0
def test_consume_fasta_and_tag_with_labels():
    """Load test-transcript.fa into a GraphLabels; expect 3 reads, 3 labels.

    Only three things are asserted: the first 20 bases of ``read_1`` are
    present in the graph, 3 reads were consumed, and 3 labels exist.  All
    printed output is diagnostic.
    """
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    lb = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-transcript.fa')

    # The second element of the result is not used by this test.
    total_reads, _unused = lb.consume_fasta_and_tag_with_labels(filename)

    print("doing get")
    first_kmer = read_1[:20]
    assert lb.graph.get(first_kmer)
    assert total_reads == 3

    print("doing n_labels")
    print(lb.n_labels())
    print("doing all labels")
    print(lb.get_all_labels())

    print("get tagset")
    for tag in lb.graph.get_tagset():
        print("forward hash")
        tag_hash = khmer.forward_hash(tag, 20)
        print(tag, tag_hash)

    for record in screed.open(filename):
        sequence = record.sequence
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(sequence, 40))

    assert lb.n_labels() == 3
Beispiel #25
0
def test_consume_fasta_and_tag_with_labels():
    """Load test-transcript.fa into a LabelHash; expect 3 reads, 3 labels.

    The prints are diagnostic only.  They are written so the output is the
    same on Python 2 and Python 3: single-argument ``print(...)`` calls, and
    ``str.format`` where the original print statement had two operands.
    """
    lb = LabelHash(20, 1e7, 4)
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    filename = utils.get_test_data('test-transcript.fa')

    # The second element (previously ``n_consumed``) was never used.
    total_reads, _ = lb.consume_fasta_and_tag_with_labels(filename)
    print("doing get")
    assert lb.get(read_1[:20])
    assert total_reads == 3
    print("doing n_labels")
    print(lb.n_labels())
    print("doing label dict")
    print(lb.get_label_dict())
    print("get tagset")
    for tag in lb.get_tagset():
        print("forward hash")
        print("{} {}".format(tag, khmer.forward_hash(tag, 20)))
    for record in screed.open(filename):
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(record.sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(record.sequence, 40))
    assert lb.n_labels() == 3
Beispiel #26
0
def test_consume_seqfile_and_tag_with_labels():
    """Load test-transcript.fa via the seqfile API; expect 3 reads, 3 labels.

    Asserts that the first 20 bases of ``read_1`` are in the graph, that 3
    reads were consumed and that 3 labels exist.  Printed output is
    diagnostic only.
    """
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    lb = GraphLabels(20, 1e7, 4)
    filename = utils.get_test_data('test-transcript.fa')

    # Only the first element of the result is checked.
    total_reads, _unused = lb.consume_seqfile_and_tag_with_labels(filename)

    print("doing get")
    first_kmer = read_1[:20]
    assert lb.graph.get(first_kmer)
    assert total_reads == 3

    print("doing n_labels")
    print(lb.n_labels())
    print("doing all labels")
    print(lb.get_all_labels())

    print("get tagset")
    for tag in lb.graph.get_tagset():
        print("forward hash")
        tag_hash = khmer.forward_hash(tag, 20)
        print(tag, tag_hash)

    for record in screed.open(filename):
        sequence = record.sequence
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(sequence, 40))

    assert lb.n_labels() == 3
Beispiel #27
0
def test_consume_fasta_and_tag_with_labels():
    """Load test-transcript.fa into a LabelHash; expect 3 reads, 3 labels.

    Diagnostic prints use forms that produce identical output on Python 2
    and Python 3 (the original print statements do not parse on Python 3).
    """
    lb = LabelHash(20, 1e7, 4)
    read_1 = 'ACGTAACCGGTTAAACCCGGGTTTAAAACCCCGGGGTTTT'
    filename = utils.get_test_data('test-transcript.fa')

    # Only the read count is checked; the second element is discarded.
    total_reads, _ = lb.consume_fasta_and_tag_with_labels(filename)
    print("doing get")
    assert lb.get(read_1[:20])
    assert total_reads == 3
    print("doing n_labels")
    print(lb.n_labels())
    print("doing label dict")
    print(lb.get_label_dict())
    print("get tagset")
    for tag in lb.get_tagset():
        print("forward hash")
        # Two operands: format explicitly so Python 2 does not print a tuple.
        print("{} {}".format(tag, khmer.forward_hash(tag, 20)))
    for record in screed.open(filename):
        print("Sweeping tags")
        print(lb.sweep_tag_neighborhood(record.sequence, 40))
        print("Sweeping labels...")
        print(lb.sweep_label_neighborhood(record.sequence, 40))
    assert lb.n_labels() == 3
Beispiel #28
0
            x<<=1
        return v


if __name__ == '__main__':
    # Demo: compare the exact number of distinct k-mers in a random sequence
    # with the estimate reported by HyperLogLog.
    k = 21

    alphabet = {0: 'A', 1: 'T', 2: 'G', 3: 'C'}

    # Build the 10000-character sequence with one join instead of repeated
    # string concatenation; the random draws happen in the same order.
    given_string = ''.join(
        alphabet[random.randint(0, 3)] for _ in range(10000))

    H = HyperLogLog(8)

    d = defaultdict(int)

    for i in range(len(given_string) - k + 1):
        kmer = given_string[i:i + k]
        d[kmer] += 1
        H.add(khmer.forward_hash(kmer, k))

    # str.format keeps the output identical on Python 2 and Python 3 (the
    # original print statements do not parse on Python 3).
    # NOTE(review): the label says "murmur3" but the values fed to H above
    # come from khmer.forward_hash -- confirm which is intended.
    print('Real: {}'.format(len(d)))
    print('HyperLogLog(murmur3): {}'.format(H.cardinality()))
            
            
            
Beispiel #29
0
def test_forward_hash():
    """Pin forward_hash values for the four homopolymer 4-mers."""
    # The asserted values pair up: AAAA/TTTT -> 0 and CCCC/GGGG -> 170.
    expected_by_kmer = (('AAAA', 0), ('TTTT', 0), ('CCCC', 170), ('GGGG', 170))
    for kmer, expected in expected_by_kmer:
        assert khmer.forward_hash(kmer, 4) == expected
Beispiel #30
0
def test_forward_hash():
    """Check forward_hash on the four single-base 4-mers."""
    kmers = ("AAAA", "TTTT", "CCCC", "GGGG")
    hashes = (0, 0, 170, 170)
    for kmer, expected in zip(kmers, hashes):
        assert khmer.forward_hash(kmer, 4) == expected
Beispiel #31
0
	def consume(self,sequence):
		"""Add the forward hash of every ``self.k``-long window of ``sequence``.

		Nothing is added when ``sequence`` is shorter than ``self.k``.
		"""
		last_start = len(sequence) - self.k
		for start in range(last_start + 1):
			window = sequence[start:start + self.k]
			self.add(khmer.forward_hash(window, self.k))
Beispiel #32
0
def test_forward_hash():
    """Verify forward_hash for AAAA, TTTT, CCCC and GGGG at K=4."""
    for expected, kmers in ((0, ('AAAA', 'TTTT')), (170, ('CCCC', 'GGGG'))):
        # Each pair of k-mers is expected to share one hash value.
        for kmer in kmers:
            assert khmer.forward_hash(kmer, 4) == expected
Beispiel #33
0
 def add(self, kmer):
     """Add ``kmer``'s murmur3 hash to the sketch chosen by its forward hash.

     The sketch index is ``khmer.forward_hash(kmer, self.prefixsize)``.
     NOTE(review): presumably this hashes only a ``prefixsize``-long prefix
     of ``kmer`` -- confirm against khmer.
     """
     bucket = khmer.forward_hash(kmer, self.prefixsize)
     sketch = self.sketches[bucket]

     digest = khmer.hash_murmur3(kmer)
     sketch.add(digest)
Beispiel #34
0
    def add(self, kmer):
        """Route ``kmer`` to one of ``self.sketches`` and record it there.

        The sketch is selected by ``khmer.forward_hash(kmer,
        self.prefixsize)``; the value stored is ``khmer.hash_murmur3(kmer)``.
        """
        sketch_index = khmer.forward_hash(kmer, self.prefixsize)
        target = self.sketches[sketch_index]

        murmur_value = khmer.hash_murmur3(kmer)
        target.add(murmur_value)