def testTrieDiff(self):
    """Check DiffTries reports accept-info mismatches and missing paths,
    and that compressed (merged) tries diff identically to uncompressed ones.
    """
    trie1 = trie.Node()
    trie2 = trie.Node()
    accept1 = trie.AcceptInfo(input_rr='%eax', output_rr='%edx')
    accept2 = trie.AcceptInfo(input_rr='%eax', output_rr='%ecx')
    # trie1 accepts 0-1-{2,3,4,5}, all with accept1.
    for last in ('2', '3', '4', '5'):
        trie.AddToUncompressedTrie(trie1, ['0', '1', last], accept1)
    # trie2 differs in two ways: 0-1-4 maps to accept2, and 0-1-5 is absent.
    trie.AddToUncompressedTrie(trie2, ['0', '1', '2'], accept1)
    trie.AddToUncompressedTrie(trie2, ['0', '1', '3'], accept1)
    trie.AddToUncompressedTrie(trie2, ['0', '1', '4'], accept2)
    node_cache = trie.NodeCache()
    compressed_trie1 = node_cache.Merge(node_cache.empty_node, trie1)
    compressed_trie2 = node_cache.Merge(node_cache.empty_node, trie2)
    # set() over the iterator replaces the original add-in-a-loop pattern.
    diffs = set(trie.DiffTries(trie1, trie2, node_cache.empty_node, ()))
    compressed_diffs = set(
        trie.DiffTries(compressed_trie1, compressed_trie2,
                       node_cache.empty_node, ()))
    # assertEqual: assertEquals is a deprecated unittest alias.
    self.assertEqual(
        diffs,
        set([(('0', '1', '4'), accept1, accept2),
             (('0', '1', '5'), accept1, None)]))
    self.assertEqual(diffs, compressed_diffs)
def _build_trie(self):
    """Rebuild self.root from the transactions in self.T.

    Each transaction is sorted in descending self.sort_order, filtered to
    the frequent items, and inserted with its multiplicity.
    """
    self.root = trie.Node()
    for transaction, multiplicity in zip(self.T, self.multiplicities):
        ordered = sorted(transaction,
                         key=lambda it: self.sort_order[it],
                         reverse=True)
        frequent_only = [it for it in ordered if it in self.frequent_items]
        self.root.insert(frequent_only, multiplicity)
def MakeUncompressedTrie(self):
    """Build and return a small uncompressed trie with one accept state."""
    uncompressed = trie.Node()
    accept = trie.AcceptInfo(input_rr='%eax', output_rr='%edx')
    # Data-driven insertion instead of five literal call statements.
    sequences = [
        ['0', '1', '2'],
        ['0', '1', '2', '3'],
        ['0', '1', '3'],
        ['0', '1', '4'],
        ['0', '1', '5'],
    ]
    for seq in sequences:
        trie.AddToUncompressedTrie(uncompressed, seq, accept)
    return uncompressed
def autocomplete_load_node(self):
    """Swap the autocomplete trie when the selected language changes.

    Saves the outgoing language's trie to disk, then loads the pickle for
    the newly selected language, creating an empty trie (and its pickle
    file) on first use.

    NOTE(review): dill.load executes arbitrary code if the pickle file is
    untrusted -- acceptable only because these are app-local cache files.
    """
    new_lang = self.languages_in.currentText().lower()
    # Persist the outgoing language's trie; the original open() handles
    # were never closed (resource leak) -- context managers fix that.
    with open(f"autocomplete/{self.actual_lang}.pickle", "wb") as fout:
        dill.dump(self.node, fout)
    self.actual_lang = new_lang
    if os.path.isfile(f"./autocomplete/{self.actual_lang}.pickle"):
        with open(f"autocomplete/{self.actual_lang}.pickle", "rb") as fin:
            self.node = dill.load(fin)
    else:
        # First time this language is selected: start empty and create
        # the pickle so the next save/load round-trip succeeds.
        node = trie.Node()
        with open(f"autocomplete/{self.actual_lang}.pickle", "wb") as fout:
            dill.dump(node, fout)
        self.node = node
def build_trie(state_feat_dic):
    """
    build trie nodes with state-features dictionary
    :param state_feat_dic: state-features dictionary
    :return: root node of trie
    """
    trie_root = trie.Node()
    # Keys are inserted in sorted order; num is a 1-based progress counter.
    for num, state_feat in enumerate(sorted(state_feat_dic.keys()), start=1):
        if num % 1000000 == 0:
            # '//' keeps the logged million-count integral: '/' yields a
            # float under Python 3 where an integer is intended here.
            logging.info('%dm-th trie node inserting..', num // 1000000)
        trie_root.insert(state_feat, state_feat_dic[state_feat])
    return trie_root
def main(fin, output_stem):
    """
    make syllable-morpheme TRIE dictionary
    :param fin: input file
    :param output_stem: output file name without extension
    """
    # Collect the set of morpheme analyses for each syllable sequence.
    # NOTE(review): unicode() is a Python 2 builtin -- this module targets
    # Python 2; the '//' fixes below are no-ops there but correct under 3.
    syll_morph_dic = defaultdict(set)
    for line_num, line in enumerate(fin, start=1):
        if line_num % 1000000 == 0:
            # '//' keeps the million-count integral under Python 3.
            logging.info('%dm-th line', line_num // 1000000)
        line = line.strip()
        if not line:
            continue
        syllable, morph = unicode(line, 'UTF-8').split(u'\t', 1)
        if _ANAL_RESULT_DELIM in morph or _MORPH_DELIM in morph:
            raise RuntimeError('Delimiter in morpheme results')
        else:
            morph = morph.replace(u'\t', _ANAL_RESULT_DELIM).replace(
                u' + ', _MORPH_DELIM)
        syll_morph_dic[syllable].add(morph)
    # Insert in sorted key order with deterministically joined values.
    trie_root = trie.Node()
    for syllable in sorted(syll_morph_dic.keys()):
        morphs = sorted(list(syll_morph_dic[syllable]))
        trie_root.insert(syllable, _ANAL_RESULT_DELIM.join(morphs))
    # Context managers guarantee the three output files are flushed and
    # closed; the originals were opened and never closed (leak).
    with open('%s.trie' % output_stem, 'wb') as fout_key, \
            open('%s.val' % output_stem, 'w') as fout_val, \
            open('%s.val.len' % output_stem, 'wb') as fout_val_idx:
        val_serial = 0
        nodes = trie_root.breadth_first_traverse()
        for idx, node in enumerate(nodes):
            logging.debug(u'%d:%s', idx, node)
            val_idx = -1
            if node.value:
                val_idx = val_serial
                val_serial += 1
                uni_val = (node.value + u'\0').encode('UTF-32LE')
                fout_val.write(uni_val)
                # Length (in UTF-32 code units) includes the terminating
                # zero; '//' avoids passing a float to struct.pack, which
                # raises struct.error under Python 3.
                fout_val_idx.write(struct.pack('h', len(uni_val) // 4))
            fout_key.write(node.pack(val_idx))
    logging.info('Number of nodes: %d', len(nodes))
    logging.info('Number of values: %d', val_serial)
contents_temp = contents titles_temp = titles for i in range(NN): for j in range(len(contents[i])): contents[i][j] = unidecode.unidecode(contents[i][j]) for j in range(len(titles[i])): titles[i][j] = unidecode.unidecode(titles[i][j]) getReference = {} documentRoot = [] collection = trie.CollectionNode() for i in range(NN): newDocument = trie.Node() documentRoot.append(newDocument) getReference[get_docID[i]] = newDocument max_tf = {} start = time.time() for i in tqdm(range(NN)): for w in contents_temp[i]: collection.add_document(w, 0, get_docID[i]) documentRoot[i].add(w, 0) if get_docID[i] in max_tf: max_tf[get_docID[i]] = max(documentRoot[i].count_words(w, 0), max_tf[get_docID[i]]) else: max_tf[get_docID[i]] = documentRoot[i].count_words(w, 0) for w in titles_temp[i]:
import trie import csv word_list = [] full_name_root = trie.Node() middle_name_root = trie.Node() last_name_root = trie.Node() with open('../data/test_data_sample.csv', 'r') as csvFile: reader = csv.reader(csvFile) counter = 0 for w in reader: full_name = "" word_list.append(w) #print("Added : " + w[0] + "Index in list : " + str(counter)) #first_name_root.add_word(w[0].lower(),index_in_list=counter) full_name += w[0].lower() if len(w) > 1: middle_name_root.add_word(w[1].lower(), index_in_list=counter) full_name += w[1].lower() if len(w) > 2: last_name_root.add_word(w[2].lower(), index_in_list=counter) full_name += w[2].lower() full_name_root.add_word(full_name, index_in_list=counter) counter += 1 def getName(index): name = "" l = len(word_list[index]) for i in range(0, l):
def __init__(self, validator):
    """Initialize counters at zero, keep the validator, and start with an
    empty sub-trie plus a fresh node cache."""
    self.validator = validator
    self.total_instructions = 0
    self.num_valid = 0
    self.sub_trie = trie.Node()
    self.node_cache = trie.NodeCache()
import trie
import sqlite3

word_list = []
full_name_root = trie.Node()

# Open the database once; the original called sqlite3.connect twice and
# discarded the first connection (leaked handle).
con = sqlite3.connect('../data/abc.db')  # database file input
cur = con.cursor()
query = "select username from user order by username"
cur.execute(query)
result = cur.fetchall()

# enumerate() supplies the running index: in the visible original,
# `counter` was initialized but never incremented, so every name was
# inserted with index_in_list=0.
for counter, w in enumerate(result):
    # word_list.append(r[0])
    full_name = ""
    word_list.append(w)
    # print("Added : " + w[0] + "Index in list : " + str(counter))
    # first_name_root.add_word(w[0].lower(),index_in_list=counter)
    full_name += w[0].lower()
    # if len(w) > 1:
    #     middle_name_root.add_word(w[1].lower(),index_in_list=counter)
    #     full_name += w[1].lower()
    # if len(w) > 2:
    #     last_name_root.add_word(w[2].lower(),index_in_list=counter)
    #     full_name += w[2].lower()
    full_name_root.add_word(full_name, index_in_list=counter)