def DictIntoDatrie(dictToDo):
    """Build forward and reversed-key frequency tries from *dictToDo*.

    Each key of *dictToDo* is a word whose entry carries a "frequency"
    value; that value is stored in the direct trie under the word itself
    and in the reverse trie under the reversed word (useful for suffix
    lookups).

    Returns a ``(directTrie, reverseTrie)`` pair of ``datrie.BaseTrie``.
    """
    import datrie
    # Hyphen plus the full Russian alphabet — the only characters the
    # tries will accept as key material.
    ALPHABET = u'-АБВГДЕЁЖЗИЙКЛМНОПРСТУФХЦЧШЩЪЫЬЭЮЯабвгдеёжзийклмнопрстуфхцчшщъыьэюя'
    directTrie = datrie.BaseTrie(ALPHABET)
    reverseTrie = datrie.BaseTrie(ALPHABET)
    for word, info in dictToDo.items():
        freq = info["frequency"]
        directTrie[word] = freq
        reverseTrie[word[::-1]] = freq
    # Persisting the tries is left to the caller:
    # directTrie.save('directTrie.trie')
    # reverseTrie.save('reverseTrie.trie')
    return directTrie, reverseTrie
def _load(self):
    """Load the kenlm language model, raw ARPA unigram/bigram tables and
    build the vocabulary trie.

    NOTE(review): depends on module-level ``kenlm``, ``np``, ``sys``,
    ``string``, ``itertools``, ``datrie`` and the helpers
    ``get_arpa_data`` / ``encode_bigrams`` defined elsewhere in the
    project, plus ``self.name`` / ``self.model_file`` / ``self.arpa_file``
    set before this is called.
    """
    print("Loading model", self.name, '...', file=sys.stderr, end='')
    self.model = kenlm.LanguageModel(self.model_file)
    print(" reading raw ARPA data ... ", file=sys.stderr, end='')
    self.id2str, self.unigram_probs, bigrams = get_arpa_data(self.arpa_file)
    # Mark "special" tokens: any entry not starting with a lowercase
    # ASCII letter (markers, punctuation tokens, etc.).
    self.is_special = np.zeros(len(self.id2str), dtype=bool)
    for i, word in enumerate(self.id2str):
        # The ARPA vocabulary order must agree with kenlm's own indices;
        # everything below relies on that alignment.
        assert self.model.vocab_index(word) == i, i
        if word[0] not in string.ascii_lowercase:
            self.is_special[i] = True
    # Since we give rare-word bonuses, count special words as super-common.
    self.unigram_probs_wordsonly = self.unigram_probs.copy()
    self.unigram_probs_wordsonly[self.is_special] = 0
    # ... but for finding the most common fallback words, count special words as impossible.
    unigram_probs_wordsonly_2 = self.unigram_probs.copy()
    unigram_probs_wordsonly_2[self.is_special] = -np.inf
    # Indices of the 500 highest-probability real words.
    self.most_common_words_by_idx = np.argsort(unigram_probs_wordsonly_2)[-500:]
    print(" Encoding bigrams to indices... ", file=sys.stderr, end='')
    self.unfiltered_bigrams, self.filtered_bigrams = encode_bigrams(bigrams, self.model)
    # Vocab trie: alphabet is every character that occurs in any word.
    self.vocab_trie = datrie.BaseTrie(set(itertools.chain.from_iterable(self.id2str)))
    for i, s in enumerate(self.id2str):
        self.vocab_trie[s] = i
    # Both sentence-end spellings are looked up; kenlm vocabularies may
    # contain either casing.
    self.eos_idx = self.model.vocab_index('</S>')
    self.eop_idx = self.model.vocab_index('</s>')
    print("Loaded.", file=sys.stderr)
def __init__(self, lang):
    """Set up per-language lookup structures.

    :param lang: language code, either ``"en"`` or ``"he"``.
    """
    assert lang in ("en", "he")
    self.lang = lang
    # Text normalizer for the chosen language.
    self.normalizer = normalizer(lang)
    # token -> list of titles that contain it
    self.token_to_titles = defaultdict(list)
    # prefix-searchable token index over the allowed character set
    self.token_trie = datrie.BaseTrie(letter_scope)
    self._tfidf_scorer = TfidfScorer()
def make_trie(filename):
    """Build a BaseTrie mapping every word in *filename* (one per line) to 0.

    Keys may contain lowercase ASCII letters and ``'*'``.

    :param filename: path to a UTF-8 word list, one word per line.
    :returns: a populated ``datrie.BaseTrie``.
    """
    valid_chars = string.ascii_lowercase + '*'
    trie = datrie.BaseTrie(valid_chars)
    # BUG FIX: the original did line.strip().decode('utf-8') on a
    # text-mode file — in Python 3, str has no .decode(), so this raised
    # AttributeError on the first line. Decode once at the I/O boundary
    # by opening the file with an explicit encoding instead.
    with open(filename, encoding='utf-8') as f:
        for line in f:
            word = line.strip()
            if word:  # skip blank lines rather than inserting an empty key
                trie[word] = 0
    return trie
def test_save_load_base():
    """Round-trip a BaseTrie through save()/load() and verify all entries."""
    import os
    fd, fname = tempfile.mkstemp()
    # FIX: mkstemp returns an OPEN file descriptor; the original leaked
    # it and also left the temp file on disk after the test.
    os.close(fd)
    try:
        trie = datrie.BaseTrie(alphabet=string.printable)
        trie['foobar'] = 1
        trie['foovar'] = 2
        trie['baz'] = 3
        trie['fo'] = 4
        trie.save(fname)

        trie2 = datrie.BaseTrie.load(fname)
        assert trie2['foobar'] == 1
        assert trie2['baz'] == 3
        assert trie2['fo'] == 4
        assert trie2['foovar'] == 2
    finally:
        os.remove(fname)  # always clean up the temp file
def test_base_trie_data():
    """Walking a BaseState then iterating must expose the stored values."""
    trie = datrie.BaseTrie(string.printable)
    trie['xo'] = 2
    trie['x'] = 1

    cursor = datrie.BaseState(trie)

    def first_value():
        # A fresh iterator positioned at the current walk state; its
        # first .next() lands on the nearest stored key.
        walker = datrie.BaseIterator(cursor)
        walker.next()
        return walker.data()

    cursor.walk('x')
    assert first_value() == 1
    cursor.walk('o')  # now at 'xo'
    assert first_value() == 2
def __enter__(self):
    """Open the sqlite database (creating it if needed) and load the trie.

    Returns ``self`` so the object is usable as a context manager.
    """
    needCreate = False
    if not self.db:
        # BUG FIX: the original computed
        #     needCreate = needCreate and not self.dbPath.exists()
        # with needCreate == False, which is always False — a missing
        # database file could never flag the create path here.
        needCreate = not self.dbPath.exists()
        dbDir = self.dbPath.parent
        dbDir.mkdir(parents=True, exist_ok=True)
        self.db = sqlite3.connect(str(self.dbPath))
        if not self.isInitialized():
            self.initDB()
        self.dt = self.loadTrie()
        if self.dt is None:
            # whole Unicode range — accept any character as trie key
            self.dt = datrie.BaseTrie(ranges=[("\0", '\U0010ffff')])
            needCreate = True
        self.trieWasModified = False
        if needCreate:
            self.createDB()
            self.save()
    return self
def test_trie_file_io():
    """Interleave pickled payloads with trie.write()/read() in one file."""
    import os
    fd, fname = tempfile.mkstemp()
    # FIX: close the descriptor mkstemp opened (we reopen by path below)
    # and remove the temp file when done — the original leaked both.
    os.close(fd)
    try:
        trie = datrie.BaseTrie(string.printable)
        trie['foobar'] = 1
        trie['foo'] = 2
        extra_data = ['foo', 'bar']
        with open(fname, "wb", 0) as f:
            pickle.dump(extra_data, f)
            trie.write(f)
            pickle.dump(extra_data, f)
        with open(fname, "rb", 0) as f:
            extra_data2 = pickle.load(f)
            trie2 = datrie.BaseTrie.read(f)
            extra_data3 = pickle.load(f)
        assert extra_data2 == extra_data
        assert extra_data3 == extra_data
        assert trie2['foobar'] == 1
        assert trie2['foo'] == 2
        assert len(trie2) == len(trie)
    finally:
        os.remove(fname)
def __get_trie(self):
    """ Opens and returns the trie if located on backing storage. If the trie does not exist, a new one is created and saved! """
    # NOTE(review): Python 2-only code (print statement, unicode/long
    # builtins) — do not run under Python 3 without porting.
    if os.path.exists(self.__vocab_trie_path):
        # Cached trie already on disk: just load it.
        print "Loading trie..."
        return datrie.BaseTrie.load(self.__vocab_trie_path)
    else:
        print "Trie not found - creating..."
        trie = datrie.BaseTrie(
            string.printable
        )  # Our acceptable characters - everything in string.printable
        # Vocab file format: one "term,frequency" pair per line.
        for input_line in self.__vocab_handle:
            input_line = input_line.strip().split(',')
            term = unicode(input_line[0])
            frequency = long(input_line[1])
            trie[term] = frequency
        # Persist so the next call takes the fast load path above.
        trie.save(self.__vocab_trie_path)
        return trie
def trie_graph(self, lst):
    """Return a BaseTrie over A-Z whose keys are the strings in *lst*.

    Every key maps to 0; only membership/prefix structure matters.
    """
    graph = datrie.BaseTrie(string.ascii_uppercase)
    for entry in lst:
        graph[entry] = 0
    return graph
]
# Flatten the road dictionary: top-level names plus every listed road.
roads = list(roadsDic.keys())
for k in roadsDic.keys():
    roads.extend(roadsDic[k])
# subway/roads
adminAreas.extend(roads)
# streets
adminAreas.extend(streets)
# housing estates
#adminAreas.extend(estate)
adminAreas = list(set(adminAreas))  # de-duplicate
#tAdmin=datrie.BaseTrie(ranges=[('\u4e00', '\u9fcb'),('\uf900','\ufad6')])
# Alphabet = every character occurring in any admin-area name.
adminWords = set(''.join(adminAreas))
# Build the dictionaries (tries).
tAdmin = datrie.BaseTrie(list(adminWords))
tPrice = datrie.BaseTrie("一二两三四五六七八九十百千1234567890")
tTag = datrie.BaseTrie(set(''.join(tags)))
for i, aA in enumerate(adminAreas):
    tAdmin[aA] = i
for r in prices:
    # Map each key to a string-derived or integer value:
    # first char beyond ASCII digits (ord > 60) means a Chinese numeral
    # word — take its value from priceDic; otherwise parse the digits.
    if ord(r[0]) > 60:
        tPrice[r] = priceDic[r]
    else:
        tPrice[r] = int(r)
for t in tags:
    tTag[t] = 0
data = {'admin': tAdmin, 'price': tPrice, 'tag': tTag}
import time
import timeit
import itertools
import text_example
import memory_profiler
import datrie

# Benchmark: time and RAM cost of loading a word list into datrie.BaseTrie.
# NOTE(review): Python 2-only (print statements); chunk is truncated past
# the final timeit.repeat(...) call in this view.
if __name__ == "__main__":
    print "RAM at start {:0.1f}MiB".format(memory_profiler.memory_usage()[0])
    # avoid building a temporary list of words in Python, store directly in the
    # Trie
    t0 = time.time()
    # The trie alphabet must contain every character of every word, so
    # collect the character set in a first pass.
    chars = set()
    for word in text_example.readers:
        chars.update(word)
    trie = datrie.BaseTrie(chars)
    t1 = time.time()
    print "Created a trie with a dictionary of {} characters in {:0.1f}s".format(
        len(chars), t1 - t0)
    # Second pass: insert every word with a dummy value.
    readers = text_example.read_words(text_example.SUMMARISED_FILE)
    for word in readers:
        trie[word] = 0
    t2 = time.time()
    print "RAM after creating trie {:0.1f}MiB, took {:0.1f}s".format(
        memory_profiler.memory_usage()[0], t2 - t1)
    print "The trie contains {} words".format(len(trie))
    assert u'Zwiebel' in trie
    time_cost = sum(
        timeit.repeat(stmt="u'Zwiebel' in trie",
def build_trie(dict_file='/usr/share/dict/words'):
    """Return a BaseTrie over a-z populated from *dict_file*.

    :param dict_file: path to a word list, one word per line.
    """
    lexicon = datrie.BaseTrie(string.ascii_lowercase)
    with open(dict_file, 'r') as handle:
        # Delegate filtering/insertion to the shared helper.
        filter_dict_into_trie(handle, lexicon)
    return lexicon
def __init__(self):
    """Create an empty trie accepting lowercase letters, digits and '()&-., '."""
    allowed_chars = u'abcdefghijklmnopqrstuvwxyz0123456789()&-., '
    self.trie = datrie.BaseTrie(allowed_chars)