def test_add(self):
    """Adding the strings "0".."9" sets exactly the pinned registers.

    Builds a sketch with error rate 0.05, adds ten decimal strings and
    compares the non-zero ``(index, value)`` register pairs against a
    fixed expected list.
    """
    sketch = HyperLogLog(0.05)
    for digit in range(10):
        sketch.add(str(digit))

    # Collect only the registers that were touched by the additions.
    nonzero = []
    for index, value in enumerate(sketch.M):
        if value > 0:
            nonzero.append((index, value))

    expected = [
        (1, 1), (41, 1), (44, 1), (76, 3), (103, 4),
        (182, 1), (442, 2), (464, 5), (497, 1), (506, 1),
    ]
    self.assertEqual(nonzero, expected)
def test_pickle(self):
    """A sketch survives a pickle round trip with its state intact.

    Compares the ``M``, ``alpha``, ``p`` and ``m`` attributes of the
    original sketch and of the unpickled copy.
    """
    original = HyperLogLog(0.05)
    for text in map(str, range(100)):
        original.add(text)

    restored = pickle.loads(pickle.dumps(original))

    # Same attributes, same order as a field-by-field comparison.
    for attr in ("M", "alpha", "p", "m"):
        self.assertEqual(getattr(original, attr), getattr(restored, attr))
def test_calc_cardinality(self):
    """The mean cardinality estimate stays close to the true cardinality.

    For each target cardinality, ``n`` independent sketches are filled
    with that many random 20-byte values and their estimates averaged.
    The deviation of the mean from the true value is expressed as a
    z-score, scaled by ``rel_err * card / sqrt(n)`` (i.e. treating
    ``rel_err`` as the relative standard error of a single estimate),
    and must lie strictly between -3 and 3.
    """
    clist = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]
    n = 30
    rel_err = 0.05
    sqrt_n = math.sqrt(n)  # loop-invariant

    for card in clist:
        s = 0.0
        # ``range`` instead of ``xrange``: the latter does not exist on
        # Python 3, and ``range`` is fine for these sizes on Python 2.
        for _trial in range(n):
            a = HyperLogLog(rel_err)
            for _item in range(card):
                a.add(os.urandom(20))
            s += a.card()

        z = (float(s) / n - card) / (rel_err * card / sqrt_n)
        # Bounds of +/-3 rather than +/-1.96: nine cardinalities each
        # checked against a 95% interval would fail by chance far too
        # often for a test that runs on random input.
        self.assertLess(-3, z)
        self.assertGreater(3, z)
def test_calc_cardinality(self):
    """Averaged estimates must fall within three scaled errors of truth.

    For every target cardinality, 30 fresh sketches are each fed that
    many random 20-byte strings; the mean of their ``card()`` results
    is turned into a z-score (deviation divided by
    ``rel_err * card / sqrt(n)``) that has to lie strictly inside
    (-3, 3).
    """
    relative_error = 0.05
    trials = 30
    targets = [1, 5, 10, 30, 60, 200, 1000, 10000, 60000]

    def estimate(size):
        # One independent sketch filled with ``size`` random values.
        sketch = HyperLogLog(relative_error)
        for _item in range(size):
            sketch.add(os.urandom(20))
        return sketch.card()

    for expected in targets:
        total = 0.0
        for _trial in range(trials):
            total += estimate(expected)

        mean_estimate = float(total) / trials
        scale = relative_error * expected / math.sqrt(trials)
        z_score = (mean_estimate - expected) / scale

        self.assertLess(-3, z_score)
        self.assertGreater(3, z_score)
def test_save(self):
    """State written by ``save()`` can be restored with ``load()``.

    Fills one sketch with the strings "0".."99", loads its saved state
    into a second sketch created with the same error rate, and checks
    that ``M``, ``alpha``, ``p`` and ``m`` agree.
    """
    source = HyperLogLog(0.05)
    for text in map(str, range(100)):
        source.add(text)
    snapshot = source.save()

    target = HyperLogLog(0.05)
    target.load(snapshot)

    for attr in ("M", "alpha", "p", "m"):
        self.assertEqual(getattr(source, attr), getattr(target, attr))
def test_update(self):
    """``update()`` merges another sketch into the receiver.

    ``a`` receives "0" and "1", ``b`` receives "2" and "3", and ``c``
    receives all four.  After ``a.update(b)`` the merged ``a`` must
    compare equal to ``c``, while ``b`` differs from both.
    """
    a = HyperLogLog(0.05)
    b = HyperLogLog(0.05)
    c = HyperLogLog(0.05)

    # ``range`` instead of ``xrange`` so the test also runs on Python 3;
    # for these tiny loops the two are interchangeable on Python 2.
    for i in range(2):
        a.add(str(i))
        c.add(str(i))

    for i in range(2, 4):
        b.add(str(i))
        c.add(str(i))

    a.update(b)

    self.assertNotEqual(a, b)
    self.assertNotEqual(b, c)
    self.assertEqual(a, c)
def test_update(self):
    """Merging two disjoint sketches equals one sketch fed everything.

    The first sketch gets "0" and "1", the second gets "2" and "3",
    and a reference sketch gets all four.  After the first is updated
    with the second it must equal the reference; the second must
    differ from both.
    """
    first = HyperLogLog(0.05)
    second = HyperLogLog(0.05)
    reference = HyperLogLog(0.05)

    for text in map(str, range(2)):
        first.add(text)
        reference.add(text)

    for text in map(str, range(2, 4)):
        second.add(text)
        reference.add(text)

    first.update(second)

    self.assertNotEqual(first, second)
    self.assertNotEqual(second, reference)
    self.assertEqual(first, reference)
def test_init(self):
    """A sketch built with error rate 0.05 exposes the pinned parameters.

    Checks ``p``, ``alpha``, ``m`` and the length of the register
    list ``M``.
    """
    expected_registers = 512
    sketch = HyperLogLog(0.05)

    self.assertEqual(sketch.p, 9)
    self.assertEqual(sketch.alpha, 0.7197831133217303)
    self.assertEqual(sketch.m, expected_registers)
    self.assertEqual(len(sketch.M), expected_registers)
def test_update_err(self):
    """``update()`` raises ValueError for sketches with different error rates.

    One sketch is created with error rate 0.05 and the other with 0.01.
    """
    coarse = HyperLogLog(0.05)
    fine = HyperLogLog(0.01)

    with self.assertRaises(ValueError):
        coarse.update(fine)
import khmer import sys from screed.fasta import fasta_iter from hyperloglog.hll import HyperLogLog filename = sys.argv[1] K = int(sys.argv[2]) # size of kmer ERROR_RATE = .01 TT = string.maketrans('ACGT', 'TGCA') hllcpp = khmer.new_hll_counter(ERROR_RATE) hlllib = HyperLogLog(ERROR_RATE) counter = Counter() counter_norc = Counter() for n, record in enumerate(fasta_iter(open(filename))): sequence = record['sequence'] seq_len = len(sequence) for n in range(0, seq_len + 1 - K): kmer = sequence[n:n + K] rc = kmer[::-1].translate(TT) hllcpp.add(kmer) hlllib.add(kmer) counter_norc.update([kmer]) if rc in counter: