def getCommonPrefix(fileSet, trieNode, separator='-_'): debug = False # debug = ('killswitch engage - temple from the within.mp3' in fileSet) root = Trie('$$') for f in fileSet: root.insert(list(trieNode.children[f].key)) prefixList = [] def dfs(trieNode, curStr=''): if debug: print curStr, trieNode.num_successors, int(0.8*len(fileSet)) if (trieNode.num_successors >= int(0.8*len(fileSet)) and trieNode.num_successors > 1) or trieNode.num_successors >= 3: res = False for k in trieNode.children: res = (dfs(trieNode.children[k], curStr+k) or res) if res: return True elif trieNode.key in separator: prefixList.append((curStr, trieNode.num_successors)) return True else: return False else: return False dfs(root) # if len(prefixList) > 0: # print prefixList, "->\n", '\n\t'.join(fileSet) return prefixList
def to_dict(self):
    """Serialize this block to a plain dict, expanding account storage.

    For each account in the state trie, decodes nonce/balance and expands
    the account's own storage trie into an int->int mapping. Returns the
    header fields plus a ``state`` mapping of hex-encoded addresses to
    per-account RLP-slot lists. (Python 2 code: uses str.encode('hex').)
    """
    state = self.state.to_dict(True)
    nstate = {}
    for s in state:
        # Each account record points at its own storage trie root.
        t = Trie(STATEDB_DIR, state[s][STORAGE_INDEX])
        o = [0] * ACCT_RLP_LENGTH
        o[NONCE_INDEX] = decode_int(state[s][NONCE_INDEX])
        o[BALANCE_INDEX] = decode_int(state[s][BALANCE_INDEX])
        o[CODE_INDEX] = state[s][CODE_INDEX]
        td = t.to_dict(True)
        # Storage keys and values are stored RLP-encoded; decode both sides.
        o[STORAGE_INDEX] = {decode_int(k): decode_int(td[k]) for k in td}
        nstate[s.encode('hex')] = o
    return {
        "number": self.number,
        "prevhash": self.prevhash,
        "uncles_root": self.uncles_root,
        "coinbase": self.coinbase,
        "state": nstate,
        "transactions_root": self.transactions_root,
        "difficulty": self.difficulty,
        "timestamp": self.timestamp,
        "extradata": self.extradata,
        "nonce": self.nonce
    }
def create_data_structure(filepath):
    """Load a newline-delimited word list from *filepath* into a Trie."""
    trie = Trie()
    with open(filepath, 'r') as handle:
        for line in handle:
            trie.add_word(line.rstrip())
    return trie
def find_compound_words(words):
    """Find words that can be written as a concatenation of other words.

    Strategy: trie + BFS + pruning.
    Trie advantages: O(k) lookup for a key of size k, and cheap enumeration
    of all prefixes of a word. Trade-off: tries are space-hungry; a radix
    tree would shrink memory but gives no practical speedup here.
    """
    compound_words = set()
    trie = Trie()
    queue = collections.deque()
    for word in words:
        # Seed the queue with every (word, remaining-suffix) split whose
        # head is a previously inserted word.
        for prefix in trie.has_prefixes(word):
            queue.append((word, word[len(prefix):]))
        trie.insert(word)
    while queue:
        word, suffix = queue.popleft()
        if word in compound_words:
            continue  # pruning: already proven compound
        if suffix in trie:
            # The leftover is itself a word -> compound found.
            compound_words.add(word)
        else:
            for prefix in trie.has_prefixes(suffix):
                queue.append((word, suffix[len(prefix):]))
    return compound_words
def test_contains_given_string_one_string_():
    """A single inserted token is reported as contained."""
    from trie import Trie
    structure = Trie()
    word = 'pig'
    structure.insert(word)
    assert structure.contains(word)
def car_trie():
    """Fixture: a Trie pre-populated with every word in WORDS."""
    from trie import Trie
    filled = Trie()
    for entry in WORDS:
        filled.insert(entry)
    return filled
def test_finds_nodes(self):
    """find('bar') walks the child links b -> a -> r to the leaf node."""
    root, node_b, node_a, node_r = Trie(), Trie(), Trie(), Trie()
    root.children['b'] = node_b
    node_b.children['a'] = node_a
    node_a.children['r'] = node_r
    found = root.find('bar')
    assert found == node_r
def test_contains_on_partial():
    """A mere prefix of an inserted word is not itself contained."""
    from trie import Trie
    structure = Trie()
    structure.insert('piglet')
    assert structure.contains('pig') is False
def test_insert_one_token():
    """Inserting one token builds the expected nested-dict chain."""
    from trie import Trie
    structure = Trie()
    structure.insert('pig')
    expected = {'p': {'i': {'g': {'$': '$'}}}}
    assert structure.container == expected
def test_contatins():
    """Test contains responds with true for a word that has been inserted."""
    from trie import Trie
    structure = Trie()
    structure.insert("cat")
    assert structure.contains("cat") is True
def test_contains_on_shorter():
    """An extension of an inserted word is not contained."""
    from trie import Trie
    structure = Trie()
    structure.insert('pig')
    assert structure.contains('piglet') is False
def find_top_k_with_trie(k=10):
    """Return the top-k entries from TDATA using a trie and a bounded heap.

    Too slow and memory-hungry in practice (~147.7 s on the test data);
    kept for comparison against faster approaches. Sample output:
        (164, 'mh') (164, 'sq') (165, 'bi') (165, 'mo') (167, 'im')
        (168, 'ux') (169, 'br') (169, 'gj') (170, 'ij') (171, 'qd')
    """
    heap = []
    words = Trie()  # trie
    with open(TDATA) as source:
        for line in source:
            words.insert(line.strip())
    # Min-heap of fixed size k keeps only the k largest items seen.
    for item in words.ipreorder(words.root):
        if len(heap) < k:
            heapq.heappush(heap, item)
        else:
            heapq.heappushpop(heap, item)
    return heap
def test_contatins_false():
    """Test contains responds with false for a word that is not inserted."""
    from trie import Trie
    structure = Trie()
    structure.insert("cat")
    assert structure.contains("dog") is False
class DictionaryTest(unittest.TestCase):
    """Exercises unigram/ngram Trie weights and the BinaryDictionary built
    from them: existence, prediction, correction, and completion."""

    def setUp(self):
        self.unigrams = Trie()
        self.unigrams['a'] = 200
        self.unigrams['hi'] = 130
        self.unigrams['hello'] = 120
        self.unigrams['there'] = 140
        self.unigrams['how'] = 150
        self.unigrams['are'] = 80
        self.unigrams['you'] = 200
        self.unigrams['your'] = 100
        self.ngrams = Trie()
        self.ngrams[['hello', 'there']] = 20
        self.ngrams[['hello', 'you']] = 25
        self.ngrams[['how', 'are', 'you']] = 80
        self.ngrams[['you', 'are', 'there']] = 30
        self.ngrams[['are', 'you', 'there']] = 60
        self.bindict = BinaryDictionary()
        self.bindict.encode_unigrams(self.unigrams)
        self.bindict.encode_ngrams(self.ngrams)

    def test_trie_weight(self):
        """Stored weights are retrievable by their exact key."""
        self.assertEqual(self.unigrams['hello'], 120)
        self.assertEqual(self.ngrams[['hello', 'there']], 20)

    def test_trie_key_error(self):
        """Looking up a key that was never inserted raises KeyError."""
        with self.assertRaises(KeyError):
            self.ngrams['hello']

    def test_trie_unigram_predict(self):
        """Predictions continue a character prefix; complete words predict nothing."""
        # FIX: assertEquals is a deprecated alias removed in Python 3.12;
        # use assertEqual throughout.
        self.assertTrue('e' in map(itemgetter(0), self.unigrams.get_predictions(['h'])))
        self.assertEqual('l', self.unigrams.get_predictions(list('he'))[0][0])
        self.assertEqual(len(self.unigrams.get_predictions(list('hello'))), 0)

    def test_trie_ngram_predict(self):
        """Predictions continue a token-sequence prefix."""
        self.assertTrue('there' in map(itemgetter(0), self.ngrams.get_predictions(['hello'])))
        self.assertTrue('you' in map(itemgetter(0), self.ngrams.get_predictions(['how', 'are'])))

    def test_bindict_exists(self):
        """Only full inserted words exist; prefixes and extensions do not."""
        self.assertTrue(self.bindict.exists('hello'))
        self.assertTrue(not self.bindict.exists('hellos'))
        self.assertTrue(not self.bindict.exists('h'))
        self.assertTrue(self.bindict.exists('a'))

    def test_bindict_ngram_predict(self):
        """The encoded dictionary preserves n-gram predictions."""
        self.assertTrue('there' in map(itemgetter(0), self.bindict.get_predictions(['hello'])))
        self.assertTrue('you' in map(itemgetter(0), self.bindict.get_predictions(['how', 'are'])))

    def test_correct(self):
        """Misspellings map back to the intended dictionary words."""
        self.assertTrue('you' in self.bindict.get_corrections('yuu').keys())
        self.assertTrue('your' in self.bindict.get_corrections('yuur').keys())

    def test_completions(self):
        """Completion results respect the requested edit/extension limit."""
        self.assertTrue('you' in self.bindict.get_completions('yo', 1))
        self.assertFalse('your' in self.bindict.get_completions('yo', 1))
        self.assertTrue('your' in self.bindict.get_completions('yo', 2))
        self.assertFalse('yo' in self.bindict.get_completions('y', 1))
def test_traversals_on_empty():
    """Traversal of an empty trie yields nothing."""
    from trie import Trie
    empty = Trie()
    collected = [item for item in empty.traversal(empty.root)]
    assert collected == []
def test_traversal_no_words():
    """Test traversal on trie with no words."""
    from trie import Trie
    empty = Trie()
    collected = list(empty.traversal(start=empty.container))
    assert collected == []
def test_overlapping_words():
    """Words sharing a prefix share their dict chain up to the fork."""
    from trie import Trie
    structure = Trie()
    for word in ('words', 'trie', 'trip'):
        structure.insert(word)
    expected = {
        'w': {'o': {'r': {'d': {'s': {'$': '$'}}}}},
        't': {'r': {'i': {'e': {'$': '$'}, 'p': {'$': '$'}}}},
    }
    assert structure.root == expected
def readDict():
    """Load every word from /usr/share/dict/words into a Trie.

    Returns:
        Trie: trie containing one entry per line of the system word list.
    """
    words = Trie()
    # FIX: use a context manager so the file handle is always released
    # (the original opened the file and never closed it).
    with open('/usr/share/dict/words', 'r') as f:
        # load words into trie
        for word in f.read().split('\n'):
            words.insert(word)
    return words
def trie_dictionary(dictionary):
    """Copy every key/value pair of *dictionary* into a fresh Trie."""
    result = Trie()
    for key, value in dictionary.items():
        result.add(key, value)
    return result
def test_2_trie():
    """Test that we can insert two words into the Trie."""
    from trie import Trie
    structure = Trie()
    for word in ('words', 'trie'):
        structure.insert(word)
    expected = {
        'w': {'o': {'r': {'d': {'s': {'$': '$'}}}}},
        't': {'r': {'i': {'e': {'$': '$'}}}},
    }
    assert structure.root == expected
def test_search_7(self):
    """Fuzzy search within distance 1 finds 'tea' from the query 'pea'."""
    trie = Trie()
    word = "tea"
    trie.insert(word)
    hits = trie.search("pea", 1)
    self.assertEqual(len(hits), 1)
    self.assertTrue(word in hits)
def test_insert_fork(self):
    """Keys forking after 'a' keep separate payloads; 'a' itself has none."""
    trie = Trie()
    payload_ab = 'ripper X!'
    payload_ax = 'for chopping'
    trie.insert('ab', payload_ab)
    trie.insert('ax', payload_ax)
    self.assertIsNone(trie.get('a'))
    self.assertEqual(trie.get('ab'), payload_ab)
    self.assertEqual(trie.get('ax'), payload_ax)
class Pruner(object):
    """Split file_b's rows into file_c (unseen) and file_d (present in
    file_a), using a trie built from file_a's first-column values."""

    def __init__(self, **kwargs):
        self.file_a = kwargs.get('file_a')   # reference rows to index
        self.file_b = kwargs.get('file_b')   # rows to be filtered
        self.file_c = kwargs.get('file_c')   # output: rows NOT in file_a
        self.file_d = kwargs.get('file_d', 'junk_out.csv')  # output: duplicates
        # self.number_of_lines = kwargs.get('number_of_lines', 800000)
        self.trie = Trie()

    def prune(self):
        """Index file_a, then route each file_b row to file_c or file_d.

        FIX: the original never closed input_file_b and leaked every handle
        on exception; all files now use context managers. Print statements
        were converted to the function form (works on Python 2 and 3).
        """
        i = 0
        j = 0
        # NUL bytes are stripped so the csv module does not choke on them.
        with open(self.file_a, 'rU') as input_file_a:
            data = csv.reader((line.replace('\0', '') for line in input_file_a),
                              delimiter=",")
            for line in data:
                i += 1
                if len(line) < 1:
                    j += 1
                else:
                    line = self.parse_string(line[0])
                    self.trie.insert_trie(line)
        print('Total Number of lines inserted in input' + str(i))
        print('Total Number of lines skipped while inserting' + str(j))
        j = 0
        k = 0
        with open(self.file_b, 'rU') as input_file_b, \
                open(self.file_c, 'w') as output_file_c, \
                open(self.file_d, 'w') as output_file_d:
            data = csv.reader((line.replace('\0', '') for line in input_file_b),
                              delimiter=",")
            csv_writer_file_c = csv.writer(output_file_c, delimiter=",")
            csv_writer_file_d = csv.writer(output_file_d, delimiter=",")
            for line in data:
                if len(line) < 1:
                    continue
                line = self.parse_string(line[0])
                if self.trie.in_trie(line):
                    j += 1
                    csv_writer_file_d.writerow(line)
                else:
                    k += 1
                    csv_writer_file_c.writerow(line)
        print(str(j) + ' in junk')
        print(str(k) + ' in output')
        print('Done')

    def parse_string(self, input_str=''):
        """Normalize a raw CSV cell; currently just a str() cast."""
        return str(input_str)
def test_auto_complete_nonexist():
    """Autocomplete of an absent prefix returns an empty list."""
    from trie import Trie
    structure = Trie()
    for word in ("catty", "church", "crutch", "cats", "dog"):
        structure.insert(word)
    assert structure.autocomplete("p") == []
def test_08_find_all(self):
    """find_all returns every stored word under the given prefix."""
    t = Trie()
    stored = ["spam", "spammer", "spamhouse", "spammers", "spams", "bacon"]
    for word in stored:
        t[word] = word
    self.assertEqual(t.find_all("bac"), ["bacon"])
    self.assertEqual(sorted(t.find_all("spam")), sorted(stored[:-1]))
    self.assertEqual(sorted(t.find_all("")), sorted(stored))
def test_search_5(self):
    """'axe' matches both 'ace' and 'ate' at edit distance 1."""
    trie = Trie()
    expected = ("ace", "ate")
    for word in expected:
        trie.insert(word)
    hits = trie.search("axe", 1)
    self.assertEqual(len(hits), 2)
    for word in expected:
        self.assertTrue(word in hits)
def test_search_8(self):
    """'pea' is within edit distance 1 of both 'tea' and 'pet'."""
    trie = Trie()
    expected = ("tea", "pet")
    for word in expected:
        trie.insert(word)
    hits = trie.search("pea", 1)
    self.assertEqual(len(hits), 2)
    for word in expected:
        self.assertTrue(word in hits)
def test_match_prefix_0(self):
    """Prefix matching on an empty trie always yields nothing."""
    trie = Trie()
    for prefix in ('a', 'z', 'abracadabra'):
        self.assertEqual(len(trie.match_prefix(prefix)), 0)
def test_search_2(self):
    """'ax' is within edit distance 1 of both 'at' and 'as'."""
    trie = Trie()
    expected = ("at", "as")
    for word in expected:
        trie.insert(word)
    hits = trie.search("ax", 1)
    self.assertEqual(len(hits), 2)
    for word in expected:
        self.assertTrue(word in hits)
def test_search_6(self):
    """'any' needs two edits to reach 'ate'; distance 1 finds nothing."""
    trie = Trie()
    word = "ate"
    trie.insert(word)
    hits = trie.search("any", 2)
    self.assertEqual(len(hits), 1)
    self.assertTrue(word in hits)
    self.assertEqual(len(trie.search("any", 1)), 0)
class TestTrie(unittest.TestCase):
    """Tests for Trie.add_word and Trie.list_words."""

    def setUp(self):
        self.t = Trie()

    # NOTE(review): no `test_` prefix, so unittest never discovers or runs
    # this method — it was likely meant to be `test_add_words`; confirm
    # before renaming, in case something calls it directly.
    def add_words(self):
        self.assertTrue(self.t.add_word("AICIXE"))
        self.assertTrue(self.t.add_word("AMBKP"))

    def test_words(self):
        """list_words filters stored words by prefix (element [1] is the list)."""
        self.t.add_word("AICIXE")
        self.t.add_word("AMBKP")
        self.assertListEqual(self.t.list_words('')[1], ["AICIXE", "AMBKP"])
        self.assertListEqual(self.t.list_words('AI')[1], ["AICIXE"])
def test_SelfAdd(self):
    """`+=` merges another trie's entries into this one in place."""
    self.trie["Foo"] = True
    other = Trie()
    other["Food"] = True
    self.assertTrue("Foo" in self.trie)
    self.assertFalse("Food" in self.trie)
    self.assertTrue("Food" in other)
    self.assertFalse("Foo" in other)
    self.trie += other
    self.assertTrue("Foo" in self.trie)
    self.assertTrue("Food" in self.trie)
def build(self, n: int = 1) -> Trie:
    """Count the frequency of every n-gram of self.seq in a Trie."""
    counts = Trie()
    for start in range(len(self.seq) + 1 - n):
        window = self.seq[start:start + n]
        # A sentence-end token may only appear as the final token.
        if '</s>' in window[:-1]:
            continue
        if window not in counts:
            counts[window] = 0
        counts[window] += 1
    return counts
def test_Addition(self):
    """`+` builds a new merged trie without mutating either operand."""
    self.trie["Foo"] = True
    t2 = Trie()
    t2["Food"] = True
    t3 = t2 + self.trie
    self.assertTrue("Foo" in self.trie)
    self.assertFalse("Food" in self.trie)
    self.assertTrue("Food" in t2)
    self.assertFalse("Foo" in t2)
    # The sum contains the union of both operands' keys.
    self.assertTrue("Foo" in t3)
    self.assertTrue("Food" in t3)
def findWords(self, board: List[List[str]], words: List[str]) -> List[str]:
    """Word Search II: trie-guided backtracking DFS over the board.

    Walks trie nodes directly (no string rebuilding). A found word is
    appended and its node.key cleared so the same word is never reported
    twice. Cells are marked '#' while on the current path and restored
    on backtrack.
    """
    def rec(r, c, node):
        ch = board[r][c]
        if ch not in node.children:
            return  # no stored word continues with this character
        node = node.children[ch]
        if node.key:
            #res.add(word)  # if using set() for output
            res.append(node.key)
            node.key = ""  # alternative to using set() for de-duplication
        board[r][c] = '#'  # mark this cell visited on the current path
        if r >= 1 and board[r - 1][c] != '#':
            rec(r - 1, c, node)
        if r + 1 < m and board[r + 1][c] != '#':
            rec(r + 1, c, node)
        if c >= 1 and board[r][c - 1] != '#':
            rec(r, c - 1, node)
        if c + 1 < n and board[r][c + 1] != '#':
            rec(r, c + 1, node)
        board[r][c] = ch  # restore on backtrack
    trie = Trie()
    for w in words:
        trie.insert(w)
    m = len(board)
    n = len(board[0])
    #res = set()
    res = []
    # Start a search from every cell of the board.
    for i in range(m):
        for j in range(n):
            rec(i, j, trie.root)
    #return list(res)
    return res
def findWords(self, board: List[List[str]], words: List[str]) -> List[str]:
    """Word Search II: backtracking DFS that rebuilds the word string and
    re-resolves its trie node with find_node at every step.

    Found words are de-duplicated by clearing the node's is_key flag.
    Cells are marked '#' while on the current path and restored after.
    """
    def rec(r, c, word=""):
        word += board[r][c]
        node = trie.find_node(word)
        if not node:
            return  # prune: no stored word starts with this prefix
        if node.is_key:
            #res.add(word)  # if using set() for output
            res.append(word)
            node.is_key = False  # alternative to using set() for de-duplication
        board[r][c] = '#'  # mark this cell visited on the current path
        if r >= 1 and board[r - 1][c] != '#':
            rec(r - 1, c, word)
        if r + 1 < m and board[r + 1][c] != '#':
            rec(r + 1, c, word)
        if c >= 1 and board[r][c - 1] != '#':
            rec(r, c - 1, word)
        if c + 1 < n and board[r][c + 1] != '#':
            rec(r, c + 1, word)
        board[r][c] = word[-1]  # restore on backtrack
    trie = Trie()
    for w in words:
        trie.insert(w)
    m = len(board)
    n = len(board[0])
    #res = set()
    res = []
    # Start a search from every cell of the board.
    for i in range(m):
        for j in range(n):
            rec(i, j)
    #return list(res)
    return res
def __init__(self, header, transactions=None, uncles=None, db=None):
    """Wrap a block header with its transaction/uncle lists and open the
    state, transaction, and receipt tries rooted at the header's hashes.

    Args:
        header: block header carrying the trie root hashes.
        transactions: list of transactions (defaults to empty).
        uncles: list of uncle headers (defaults to empty).
        db: backing database; required.

    Raises:
        TypeError: if no db is supplied.
    """
    if transactions is None:
        transactions = []
    if uncles is None:
        uncles = []
    self.db = db
    if self.db is None:
        raise TypeError("Block must have a db")
    super(FrontierBlock, self).__init__(
        header=header,
        transactions=transactions,
        uncles=uncles,
    )
    # Each trie is opened at the corresponding root hash from the header.
    self.state_db = State(self.db, root_hash=self.header.state_root)
    self.transaction_db = Trie(self.db, root_hash=self.header.transaction_root)
    self.receipt_db = Trie(self.db, root_hash=self.header.receipts_root)
def __init__(self):
    """Set up leet-speak character mappings and load the default profanity
    word list and profane-site list."""
    # Each letter maps to the character substitutions treated as equivalent
    # when matching obfuscated profanity (e.g. '@' or '4' for 'a').
    self.CHARS_MAPPING = {
        "a": ("a", "@", "*", "4"),
        "i": ("i", "*", "l", "1"),
        "o": ("o", "*", "0", "@"),
        "u": ("u", "*", "v"),
        "v": ("v", "*", "u"),
        "l": ("l", "1"),
        "e": ("e", "*", "3"),
        "s": ("s", "$", "5"),
        "t": ("t", "7")
    }
    self.censor_urls = set()
    self.profane_trie = Trie()
    # Bundled default data files, resolved relative to the package.
    self.default_wordlist_filename = get_complete_path(
        'data/profanity_wordlist.txt')
    self.default_urls_filename = get_complete_path(
        'data/profane_sites.txt')
    # None/None means: use the default word list with no whitelist.
    self.load_profane_words(profane_words=None, whitelist_words=None)
    self.load_profane_urls()
def read_volunteers():
    """
    Read all the volunteers in and orchestrate their transformation:
    build Volunteer objects from volunteers.csv, index Slack users by
    lowercased real name in a trie, and write the rendered markdown.
    """
    group = None  # Will hold Volunteer Objects
    user_trie = Trie()  # Will contain complete slack user list in JSON
    with open("volunteers.csv") as volunteers:
        reader = csv.reader(volunteers)
        group = [Volunteer(line) for line in reader]
        group.pop(0)  # drop the CSV header row
    with open("config.yaml") as config:
        # NOTE(review): yaml.load without an explicit Loader is unsafe on
        # untrusted input — prefer yaml.safe_load if the config may vary.
        reader = yaml.load(config)
    user_list = get_users_slack(reader["slack"])
    for user in user_list:
        if 'real_name' not in user:
            continue
        user_trie.add(user['real_name'].lower(), user)
    # FIX: use a context manager so the markdown file is flushed and closed
    # even if parse_slack or the write raises (the original only closed it
    # on the happy path).
    with open('./volunteers.md', 'w') as md_file:
        for gr in group:
            gr.parse_slack(user_trie)
            md_file.write(str(gr))
def main():
    """Load the employee dump, index names in a trie, and print a sample
    employee together with their manager."""
    dump_path = r"./Data/load_employees_dump.txt"
    name_trie = Trie()
    employees = {}
    field_names = ['employee_id', 'birth_date', 'first_name', 'last_name',
                   'gender', 'joining_date', 'manager_id']
    with open(dump_path, "r") as handle:
        for row in csv.DictReader(handle, field_names):
            emp_id = row['employee_id']
            full_name = row['first_name'] + '.' + row['last_name']
            employees[emp_id] = (emp_id, full_name, row['manager_id'])
            name_trie.insert(full_name, emp_id)
    status, emp_id = name_trie.search('Cristinel.Bouloucos')
    print(employees[emp_id])
    emp_id, full_name, mgr_id = employees[emp_id]
    print('Employee Name: {} and Manager Name: {}'.format(
        full_name, employees[mgr_id][1]))
def get(conn: socket, request: str, trie: Trie):
    """Serve a GET request: look the key up in the trie and respond.

    Falsy lookup results produce a not-found response; otherwise the
    key/value pair is sent back as a success payload.
    """
    key = request[len(GET):].strip()
    value = trie.get(key=key)
    if not value:
        not_found_response(s=conn, request=request)
        return
    log(code=SUCCESS, request=request)
    construct_response(s=conn, data={key: value}, success=True)
def test_06_trie_setdefault(self):
    """setdefault returns the stored value, inserting the default on miss."""
    trie = Trie()
    trie.setdefault("spam", []).append("eggs")
    self.assertEqual(trie["spam"], ["eggs"])
    # Second call must return the existing list, not a fresh one.
    trie.setdefault("spam", []).append("coffee")
    self.assertEqual(trie["spam"], ["eggs", "coffee"])
    self.assertEqual(trie.setdefault("spa", "bacon"), "bacon")
    self.assertEqual(trie["spa"], "bacon")
def __init__(self):
    """Initialize the segmenter: trie dictionary, bigram frequencies,
    special-case handling, HMM model, and common-name tables."""
    # TODO: revisit the smoothing choice on the test set
    self.minfreq = -3.14e+100  # log-probability floor for unseen entries
    # Build the trie used to scan the full-segmentation DAG.
    self.trie = Trie()
    self.construct_trie()
    # Build the bigram dictionary
    # self.construct_bigram_dic()
    # Load the pre-built bigram dictionary instead of rebuilding it.
    with open('files/bigram_dic.json', 'r') as f:
        self.bigram_dic = json.load(f)
    # Handler for special-case processing.
    self.SP = SpecialProcess()
    # Create the HMM segmentation model.
    self.hmm = HMM()
    # Load given-name data from common personal names.
    self.get_second_names()
    self.get_first_name()
def func():
    """Build the command and tag tries once shared data is available and
    publish them via the module-level ``tries`` dict."""
    # Wait until tags and aliases have finished loading (busy-wait).
    while not tags or not all_alias:
        time.sleep(0.1)
    cmd_trie, tag_trie = Trie(), Trie()
    cmd_trie.add(cfg.cmds)
    cmd_trie.add(all_alias['cmd'].keys())
    tag_trie.add(tags.keys())
    global tries
    tries = {'cmd': cmd_trie, 'tag': tag_trie}
def __init__(self, root=b'', env=Env(), executing_on_head=False, **kwargs):
    """Initialize chain state backed by a secure trie at *root*.

    NOTE(review): the mutable default ``env=Env()`` is evaluated once and
    shared across all calls that omit it — confirm this sharing is
    intended. Also, ``self.db`` is read here but never assigned in this
    method; presumably it is a property derived from env or the base
    class — verify before relying on it.
    """
    self.env = env
    self.trie = SecureTrie(Trie(RefcountDB(self.db), root))
    # Seed bookkeeping fields from the module-level defaults.
    self.txindex = STATE_DEFAULTS['txindex']
    self.block_number = STATE_DEFAULTS['block_number']
    self.block_coinbase = STATE_DEFAULTS['block_coinbase']
    self.timestamp = STATE_DEFAULTS['timestamp']
    self.prev_headers = STATE_DEFAULTS['prev_headers']
    self.journal = []   # rollback log
    self.cache = {}
    self.changed = {}
    self.executing_on_head = executing_on_head
def delete_all(secret_key):
    """Truncate RepoStrings and reset the trie when the secret key matches.

    Returns an empty 204 response on success, a 403 JSON error otherwise.
    NOTE(review): a single hard-coded character is a very weak secret and
    the comparison is not constant-time — consider a real credential check.
    """
    if secret_key == '8':
        # secret key is correct, delete all from db
        RepoStrings.query.delete()
        db.session.commit()
        set_trie(Trie(), reset=True)
        return '', 204
    return jsonify({
        'Unauthorized': 'Secret delete key incorrect, unable to truncate table'
    }), 403
def __init__(self, dictionary, connection):
    """Load the reading dictionary into a trie and the connection-cost table.

    Args:
        dictionary: path to a tab-separated file of
            (yomi, left-id, right-id, cost, word) records.
        connection: path to the connection matrix file; first line holds
            "<lsize> <rsize>", then "<lid> <rid> <cost>" rows.

    (Python 2 code: uses unicode().)
    """
    # initalize trie
    # FIX: both files were previously opened without ever being closed;
    # context managers now guarantee release even on parse errors.
    self.trie = Trie()
    with open(dictionary) as dict_file:
        for line in dict_file:
            (yomi, lid, rid, cost, word) = line.strip().split("\t", 4)
            lid, rid, cost = int(lid), int(rid), int(cost)
            yomi, word = unicode(yomi, 'utf-8'), unicode(word, 'utf-8')
            self.trie.insert(yomi, (word, lid, rid, cost))
    # initialize connection
    # FIX: renamed the handle from `file`, which shadowed the builtin.
    with open(connection) as conn_file:
        lsize, rsize = conn_file.readline().strip().split(" ", 1)
        lsize, rsize = int(lsize), int(rsize)
        self.connection = [None] * rsize
        for line in conn_file:
            (lid, rid, cost) = line.strip().split(" ", 2)
            lid, rid, cost = int(lid), int(rid), int(cost)
            if lid != 0:
                # Only rows with left-id 0 are consumed; stop at the first
                # other row (preserved from the original — confirm intent).
                break
            self.connection[rid] = cost
def _save_trie(rsc_dir, entries):
    """
    Save the pre-analysis dictionary trie and its value file.

    Args:
        rsc_dir: target resource directory
        entries: list of entries
    """
    trie = Trie()
    total_tag_nums = 0
    for entry in entries:
        val = total_tag_nums
        val += 1    # indices start at 1, not 0
        val *= 2    # exact whole-word matches get even values
        val += 1 if entry.is_pfx else 0    # prefix-match patterns get odd values
        trie.insert(entry.word, val)
        total_tag_nums += len(entry.tag_nums)
    trie.save(f'{rsc_dir}/preanal.tri')
    val_file = f'{rsc_dir}/preanal.val'
    with open(val_file, 'wb') as fout:
        # Indices start at 1, so write one dummy record at the front.
        fout.write(struct.pack('H', 0))
        for idx, entry in enumerate(entries, start=1):
            logging.debug('%d: %s: %s: %s', idx, entry.word, entry.tag_outs, entry.tag_nums)
            fout.write(struct.pack('H' * len(entry.tag_nums), *entry.tag_nums))
    logging.info('value saved: %s', val_file)
    logging.info('total entries: %d', len(entries))
    logging.info('expected size: %d',
                 (sum([len(e.tag_nums) for e in entries])+1) * struct.Struct('H').size)
class TestTrie(unittest.TestCase):
    """Python test meant to run on a saved copy of Slack JSON data."""

    def setUp(self):
        """Load users.json and index every member by lowercased real name."""
        self.trie = Trie()
        # Default Slack JSON file name
        with open('users.json') as users:
            loaded = json.load(users)
            for member in loaded['members']:
                # Bots/deleted accounts may lack a real_name; skip them.
                if 'real_name' not in member:
                    continue
                self.trie.add_name(0, member['real_name'].lower(), member)

    def test_addition(self):
        """ Making Sure trie entries are added correctly """
        added = self.trie.add('Jane Doe'.lower(), {'value': 0})
        self.assertTrue(added)
        self.assertIsNotNone(self.trie.search('Jane Doe'.lower()))

    def test_search(self):
        """ Verifying Trie entries """
        # 'John Doe' is not in the fixture; 'Aaron Long' is expected to be.
        self.assertIsNone(self.trie.search('John Doe'.lower()))
        self.assertIsNotNone(self.trie.search('Aaron Long'.lower()))
class Thesaurus(object):
    """When initialize the Thesaurus, it will scan the word_bank.txt and
    establish the dictionary trie."""

    def __init__(self, word_bank_path, tags):
        """Build the trie from a space-separated word-bank file.

        Each line holds a word followed by attribute values, which are
        zipped against *tags* to form the attribute dict.

        Raises:
            ValueError: if tags is not a list.
        """
        if not isinstance(tags, list):
            # FIX: the original message claimed "'default_setting' must be
            # dict!" although the check is on `tags` being a list.
            raise ValueError("'tags' must be a list!")
        self._trie = Trie()
        # FIX: close the word-bank file deterministically (the original
        # iterated an anonymous open() and leaked the handle).
        with open(word_bank_path, 'r') as word_bank:
            for line in word_bank:
                item = strdecode(line).strip().split(' ')
                attr = {}
                for index in range(len(item) - 1):
                    attr[tags[index]] = item[index + 1]
                self._trie.add_new_word(item[0], attr)

    def __len__(self):
        return self._trie.__len__()

    def __contains__(self, word):
        return self._trie.__contains__(word)

    def clear(self):
        """Drop all entries by replacing the underlying trie."""
        self._trie = Trie()

    def has_word(self, word):
        """Return whether the thesaurus contains the word"""
        return self._trie.has_word(word)

    def get_attr(self, word):
        """Return the frequency of the word"""
        return self._trie.get_attr(word)
class TestPreprocess(unittest.TestCase):
    """Trie tests using two random sentences that share a 16-char prefix."""

    def setUp(self):
        # Random 16-character alphanumeric fragments. string_1 and string_2
        # share common_prefix; not_string also starts with it but was never
        # added to the trie.
        self.common_prefix = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.ending_1 = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.ending_2 = ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.string_1 = self.common_prefix + self.ending_1
        self.string_2 = self.common_prefix + self.ending_2
        self.not_string = self.common_prefix + ''.join(
            random.choices(string.ascii_letters + string.digits, k=16))
        self.root = TrieNode("")
        self.trie = Trie(self.root)
        self.trie.add_sentence(self.root, self.string_1)
        self.trie.add_sentence(self.root, self.string_2)

    def test_contains(self):
        """
        Test to verify that Trie.contains() returns True on sentences that
        have been added to the Trie. Returns False for those that do not
        exist within the Trie.
        """
        # contains() returns a tuple; element [0] is the membership flag.
        self.assertTrue(self.trie.contains(self.root, self.string_1)[0])
        self.assertTrue(self.trie.contains(self.root, self.string_2)[0])
        self.assertFalse(self.trie.contains(self.root, self.not_string)[0])

    def test_return_completions_from_node(self):
        """
        Test to verify that Trie.return_completions_from_node() correctly
        enumerates sentences that exist within the Trie.
        """
        # Element [1] of the contains() tuple is the node reached.
        node = self.trie.contains(self.root, self.common_prefix)[1]
        completions = self.trie.return_completions_from_node(node)
        # Although the unittest method is misleadingly named, it actually
        # checks if two arrays contain the same elements.
        self.assertCountEqual([self.ending_1, self.ending_2], completions)
def test_add(self):
    """Adding words builds nested char dicts with '$': None end markers;
    a shorter word sharing a prefix adds a marker mid-chain."""
    t = Trie()
    t.add('hello')
    # FIX: assertEquals is a deprecated alias removed in Python 3.12;
    # use assertEqual.
    self.assertEqual(t.trie, {'h': {
        'e': {
            'l': {
                'l': {
                    'o': {
                        '$': None
                    }
                }
            }
        }
    }})
    t.add('hell')
    self.assertEqual(
        t.trie,
        {'h': {
            'e': {
                'l': {
                    'l': {
                        'o': {
                            '$': None
                        },
                        '$': None
                    }
                }
            }
        }})
def build_offline_data_model(max_suggestions, min_words_partial, input_filepath, output_filepath):
    """Perform all offline data processing steps to build the data model.

    Includes:
    1) Normalization of the agents' responses [normalize_responses.py].
    2) Building the data model [data_model.py] based on the normalized
       responses and the Trie class [trie.py].
    3) Storage (pickled file) of the built data model at output_filepath.

    Arguments
    ---------
    max_suggestions: int
        Maximum number of auto-complete suggestions.
    min_words_partial: int
        Minimum number of words in auto-complete suggestions of partial
        sentences.
    input_filepath: str
        Relative path of the JSON file with conversations.
    output_filepath: str
        Relative path of the pickled file to be saved.

    Returns
    -------
    Trie
        The populated auto-complete trie (also serialized to disk).
    """
    responses = extract_responses_from_JSON(input_filepath)
    print('Read in data. Starting processing of responses now...')
    ######################## Normalization ########################
    start = time.time()
    # A signature is constructed for each sentence in the responses using
    # lemmatized forms of words. signature_to_text is a dict whose values
    # are sentences with identical signatures that are grouped together
    # and represented as repeated copies of the same normalized form.
    signature_to_text = get_signature_to_text_map(responses)
    # Next, we flatten nested lists of these normalized sentences:
    responses_processed = list(chain.from_iterable(signature_to_text.values()))
    end = time.time()
    duration = round(end - start, 2)
    print("Finished normalizing agent responses in {} s".format(duration))
    ############## Creating and Saving the Data Model #############
    start = time.time()
    # A trie (prefix tree) is used here to construct a data model.
    # A trie allows efficiently (w.r.t. time) accessing all strings
    # matching a prefix.
    autocomplete_trie = Trie(max_suggestions, min_words_partial)
    for response in responses_processed:
        # insert responses to grow the trie
        autocomplete_trie.insert_response(response)
    end = time.time()
    duration = round(end - start, 2)
    print('Inserted normalized responses into the trie in {} s.'.format(
        duration))
    # Save the trie containing all agent responses
    start = time.time()
    autocomplete_trie.save(output_filepath)
    end = time.time()
    duration = round(end - start, 2)
    print(
        'Trie with normalized responses saved at: {}'.format(output_filepath))
    print('Serialization of the trie took {} s.'.format(duration))
    return autocomplete_trie
def findWords(self, board: List[List[str]], words: List[str]) -> List[str]:
    """Word Search II: DFS from every cell, pruned by trie prefix lookups.

    Results are collected in a set, so no node-flag trickery is needed for
    de-duplication. Cells are marked '#' while on the current path.
    """
    def rec(r, c, word=""):
        # Out of bounds. (NOTE(review): returns False where every other
        # exit returns None; callers ignore the value, so it is harmless.)
        if not (0 <= r < m and 0 <= c < n):
            return False
        if board[r][c] == '#':  # cell already used on this path
            return
        word += board[r][c]
        if not trie.starts_with(word):
            return  # prune: no stored word extends this prefix
        if trie.search(word):
            res.add(word)
            # need to continue searching: a longer word may share this prefix
        board[r][c] = '#'  # mark visited
        rec(r - 1, c, word)
        rec(r + 1, c, word)
        rec(r, c - 1, word)
        rec(r, c + 1, word)
        board[r][c] = word[-1]  # restore on backtrack
    trie = Trie()
    for w in words:
        trie.insert(w)
    m = len(board)
    n = len(board[0])
    res = set()
    # Start a search from every cell; bounds are checked inside rec().
    for i in range(m):
        for j in range(n):
            rec(i, j)
    return list(res)
def test_remove_one_but_not_both(self):
    """Removing one key returns its data and leaves the sibling intact."""
    trie = Trie()
    first, second = 'abc', 'abd'
    first_data, second_data = 123, 987
    trie.insert(first, first_data)
    trie.insert(second, second_data)
    self.assertEqual(trie.remove(first), first_data)
    self.assertIsNone(trie.get(first))
    self.assertEqual(trie.get(second), second_data)
def test_match_prefix_4(self):
    """Both words beginning with 'a' are returned for that prefix."""
    trie = Trie()
    expected = ('at', 'absent')
    for word in expected:
        trie.insert(word)
    hits = trie.match_prefix('a')
    self.assertEqual(len(hits), 2)
    for word in expected:
        self.assertTrue(word in hits)
def test_traverse_simple():
    """Test traverse method."""
    t = Trie()
    t.insert('water')
    t.insert('wash')
    generator = t.traverse('wa')
    # Pull exactly the two words sharing the 'wa' prefix.
    produced = [next(generator) for _ in range(2)]
    for word in produced:
        assert word in ['wash', 'water']
def test_dict_updates():
    """The dict of words stays updated."""
    from trie import Trie
    t = Trie()
    for word in ('Apple', 'Banana', 'Stalin'):
        t.insert(word)
    assert 'Stalin' in t.dict_of_words
def main():
    """Run a tiny TCP key/value server backed by a Trie.

    Protocol per request: the client first sends the request size, the
    server ACKs with OK, then the client streams the request payload of
    the form "<METHOD> ...". Requests are dispatched to the get/put/
    query/delete handlers until the client disconnects.
    """
    # arg parsing
    parser = argparse.ArgumentParser(
        description="Process arguments for data creation")
    parser.add_argument('-a', type=str, help="ip address", default=HOST)
    parser.add_argument('-p', type=int, help="port", default=PORT)
    args = parser.parse_args()
    # init Trie
    trie = Trie()
    # listen for connections
    with socket.socket(socket.AF_INET, socket.SOCK_STREAM) as s:
        print('Listening on ' + args.a + ':' + str(args.p))
        s.bind((args.a, args.p))
        s.listen()
        while True:
            conn, addr = s.accept()
            with conn:
                while True:
                    # receive request size
                    request_size = conn.recv(BUFFER_SIZE)
                    if not request_size:
                        break  # client closed the connection
                    # ack for request size
                    conn.sendall(OK.encode())
                    # read data from request
                    request = conn.recv(BUFFER_SIZE)
                    # The method token is parsed from the first chunk.
                    method = request.decode().split(" ")[0]
                    # Keep reading until the advertised byte count arrives.
                    while len(request) < int(request_size):
                        data = conn.recv(BUFFER_SIZE)
                        request = request + data
                    # process request
                    request = request.decode()
                    if method == GET:
                        get(conn=conn, request=request, trie=trie)
                    elif method == PUT:
                        put(conn=conn, request=request, trie=trie)
                    elif method == QUERY:
                        query(conn=conn, request=request, trie=trie)
                    elif method == DELETE:
                        delete(conn=conn, request=request, trie=trie)
class invertFile():
    """Inverted index: a trie maps each key to its index in an occurrence
    list of [key, total_freq, [(file, freq, seq, page), ...]] entries."""

    def __init__(self):
        self.keyTrie = Trie()        # key -> index into occurence_list
        self.occurence_list = []     # one entry per distinct key
        self.list_length = 0         # next free index

    def put(self, key, freq, seq, fileAddress, pageAddress):
        ## PUT ELEMENT INTO INVERTFILE
        if self.keyTrie.isWordExist(key):
            ## THIS IS A EXIST KEY: accumulate frequency, append the posting
            occurence_list_index = self.keyTrie.searchValue(key)
            self.occurence_list[occurence_list_index][1] += freq
            self.occurence_list[occurence_list_index][2].append(
                (fileAddress, freq, seq, pageAddress))
            # keep postings ordered by sequence number
            self.occurence_list[occurence_list_index][2].sort(
                key=lambda x: x[2])
        else:
            ## THIS IS A NEW KEY
            ## CREATE OCCURENCE_LIST
            self.occurence_list.append(
                [key, freq, [(fileAddress, freq, seq, pageAddress)]])
            ## PUT IT IN TRIE
            self.keyTrie.insert(key, self.list_length)
            self.list_length += 1

    def get(self, key):
        ## GET A OCCURENCE LIST OF A GIVEN KEY
        return self.occurence_list[self.keyTrie.searchValue(key)]

    def showDictionary(
            self):  ## PRINT ALL INVERTFILE. THIS IS A METHOD FOR TEST
        allkey = self.keyTrie.showAllKey()
        for key in allkey:
            print(key, self.keyTrie.searchValue(key), self.get(key))

    def saveInvertFile(
            self, fileName="Occurence_List.dat"):  ## SAVE THE ALL OCCURENCE LIST
        # FIX: use a context manager so the output file is closed even if a
        # write fails (the original leaked the handle on exception).
        with open(fileName, 'w') as f:
            for [keyName, keyfreqency, Occurence_List] in self.occurence_list:
                f.write(keyName + "|||" + str(keyfreqency) + "|||")
                for (fileAddress, freqInPage, pageSeq,
                     pageAddress) in Occurence_List:
                    f.write(fileAddress + "|||" + str(freqInPage) + "|||" +
                            str(pageSeq) + "|||" + pageAddress + "|||")
                f.write("\n")

# inf = invertFile()
# inf.put("nltk",10,"1.html",1,"www.q")
# inf.put("language",6,"1.html",1,"www.q")
# inf.put("nltk",10,"2.html",2,"www.e")
# print(inf.get("nltk"))
# inf.showDictionary()