def load_as_hattrie(filepath):
    """Build a ``hat_trie.Trie`` keyed by the lines of a text file.

    Every line has its trailing whitespace (including the newline) removed
    and is stored as a key mapped to ``0``.

    Args:
        filepath: Path of the file to read; it is opened in text mode.

    Returns:
        The populated ``hat_trie.Trie``.
    """
    with open(filepath, 'r') as f:
        data = hat_trie.Trie()
        # Iterate the file object itself so lines are streamed one at a time
        # instead of being materialised into a list by ``readlines()``.
        for line in f:
            data[line.rstrip()] = 0
    return data
def test_leak():
    """Check reference counts of values stored in, and released by, a trie.

    Every lowercase ASCII letter is assigned in turn to the same key, so each
    assignment replaces the previously stored value.  The test asserts that:

    * repeated lookups leave the stored value's reference count unchanged;
    * the first (replaced) value is back at its initial count, while the last
      (still stored) value has exactly one extra reference;
    * deleting the trie restores every value to its initial count.
    """
    import sys
    values = list(string.ascii_lowercase)
    # Using "list(map())" to avoid the list comprehension variable
    # which increases the reference count.
    counts = list(map(sys.getrefcount, values))
    trie = hat_trie.Trie()
    # Overwrite the same key with each value in turn.
    for v in values:
        trie['foo'] = v
    # Python's for loop variables leak scope into the function body
    del v
    # Baseline count of the currently stored value, as seen through a lookup.
    count = sys.getrefcount(trie['foo'])
    # Looking the value up again and again must not change its count.
    for i in range(10):
        current_count = sys.getrefcount(trie['foo'])
        assert current_count == count
    count0 = sys.getrefcount(values[0])
    count_last = sys.getrefcount(values[-1])
    # The first value was replaced, so it is back at its starting count.
    assert count0 == counts[0]
    # The last value is still stored under 'foo': exactly one extra reference.
    assert count_last == counts[-1] + 1
    del trie
    # With the trie gone, every count must match the initial snapshot.
    after = list(map(sys.getrefcount, values))
    assert after == counts
def test_iterkeys():
    """``iterkeys()`` yields a stored non-ASCII key unchanged."""
    trie = hat_trie.Trie()
    cyrillic_key = 'вася'
    trie[cyrillic_key] = 20

    # Only one key is stored, so the first item yielded must be that key.
    key_iterator = trie.iterkeys()
    first_key = next(key_iterator)
    assert first_key == cyrillic_key
def test_get():
    """``get()`` falls back to a default for absent keys, else the value."""
    trie = hat_trie.Trie()

    # Nothing is stored yet: the implicit default is None and an explicit
    # default is handed back as-is.
    for absent_key in ('foo', 'bar'):
        assert trie.get(absent_key) is None
    assert trie.get('foo', 5) == 5

    trie['foo'] = 5
    trie['bar'] = 10

    # Once stored, the real values are returned.
    for key, expected in (('foo', 5), ('bar', 10)):
        assert trie.get(key) == expected
def test_get_set():
    """Item assignment and lookup round-trip; unknown keys raise KeyError."""
    trie = hat_trie.Trie()
    stored_items = (('foo', 5), ('bar', 10))

    for key, value in stored_items:
        trie[key] = value
    for key, value in stored_items:
        assert trie[key] == value

    # A prefix of a key, an extension of a key and an unrelated key are all
    # absent.
    for absent_key in ('f', 'foob', 'x'):
        with pytest.raises(KeyError):
            trie[absent_key]

    cyrillic_key = 'вася'
    trie[cyrillic_key] = 20
    assert trie[cyrillic_key] == 20
def test_getitem_set():
    """Values of different types round-trip; unknown keys raise KeyError."""
    trie = hat_trie.Trie()
    # An int, a str and a tuple, inserted in this order.
    expected_by_key = {
        'foo': 5,
        'bar': 'asdf',
        'baz': (10, 'quuz'),
    }

    for key, value in expected_by_key.items():
        trie[key] = value
    for key, value in expected_by_key.items():
        assert trie[key] == value

    # A prefix of a key, an extension of a key and an unrelated key are all
    # absent.
    for absent_key in ('f', 'foob', 'x'):
        with pytest.raises(KeyError):
            trie[absent_key]

    cyrillic_key = 'вася'
    trie[cyrillic_key] = 20
    assert trie[cyrillic_key] == 20
def test_get_set_fuzzy():
    """Round-trip many random mixed Cyrillic/ASCII words through a trie.

    Up to 20000 distinct random words of 2-10 characters are stored with
    their list index as the value, then looked up in shuffled order.  Finally
    the trie's keys are compared with the generated words.
    """
    russian = 'абвгдеёжзиклмнопрстуфхцчъыьэюя'
    alphabet = russian.upper() + string.ascii_lowercase

    # Draw the length first, then the characters, for each candidate word;
    # the set removes duplicates.
    unique_words = set()
    for _ in range(20000):
        length = random.randint(2, 10)
        letters = [random.choice(alphabet) for _ in range(length)]
        unique_words.add("".join(letters))
    words = list(unique_words)

    trie = hat_trie.Trie()
    indexed_words = list(enumerate(words))
    for position, word in indexed_words:
        trie[word] = position

    # Look the words up in a different order from the one they were stored in.
    random.shuffle(indexed_words)
    for position, word in indexed_words:
        assert word in trie, word
        assert trie[word] == position, (word, position)

    assert sorted(trie.keys()) == sorted(words)
def create_trie():
    """Return a trie that maps every item from ``words100k()`` to ``1``."""
    vocabulary = words100k()
    result = hat_trie.Trie()
    for entry in vocabulary:
        result[entry] = 1
    return result
import time
import timeit
import text_example
import memory_profiler
import hat_trie

if __name__ == "__main__":
    # Memory reading taken before the trie exists.
    ram_at_start = memory_profiler.memory_usage()[0]
    print("RAM at start {:0.1f}MiB".format(ram_at_start))

    # avoid building a temporary list of words in Python, store directly in the
    # Trie
    build_started = time.time()
    words_trie = hat_trie.Trie()
    for word in text_example.readers:
        words_trie[word] = 0
    build_finished = time.time()

    # Memory is sampled only after the build timer has stopped.
    ram_after_build = memory_profiler.memory_usage()[0]
    build_seconds = build_finished - build_started
    print("RAM after creating trie {:0.1f}MiB, took {:0.1f}s".format(ram_after_build, build_seconds))
    print("The trie contains {} words".format(len(words_trie)))

    assert 'Zwiebel' in words_trie

    # ``words_trie`` must remain a module-level name: the timeit setup string
    # imports it from ``__main__``.  One lookup per run, 10000 runs, summed.
    lookup_timings = timeit.repeat(
        stmt="u'Zwiebel' in words_trie",
        setup="from __main__ import words_trie",
        number=1,
        repeat=10000,
    )
    time_cost = sum(lookup_timings)
    print("Summed time to lookup word {:0.4f}s".format(time_cost))
def test_contains():
    """The ``in`` operator reports only keys that were actually stored."""
    trie = hat_trie.Trie()
    stored_key = 'foo'

    assert stored_key not in trie
    trie[stored_key] = 5
    assert stored_key in trie

    # A strict prefix of a stored key is not itself a member.
    assert 'f' not in trie