def test_add(self):
    t = Trie()
    t.add('hello')
    self.assertEqual(
        t.trie,
        {'h': {'e': {'l': {'l': {'o': {'$': None}}}}}})
    t.add('hell')
    self.assertEqual(
        t.trie,
        {'h': {'e': {'l': {'l': {'o': {'$': None}, '$': None}}}}})
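# A minimal sketch of the dict-based Trie.add the test above implies:
# nested dicts keyed by character, with '$' marking the end of a word.
# The attribute name `trie` and the '$' sentinel come from the test;
# the implementation itself is an assumption.
class Trie:
    def __init__(self):
        self.trie = {}

    def add(self, word):
        node = self.trie
        for ch in word:
            node = node.setdefault(ch, {})
        node['$'] = None  # terminator marking a complete word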
class Bigram:
    def __init__(self):
        self.minfreq = -3.14e+100
        self.load()
        self.construct_Trie()

    def load(self):
        root = 'model_params'
        with open(root + '/bidic.json', 'r') as f:
            self.bidic = json.load(f)
        with open(root + '/pinyin.json', 'r') as f:
            self.pinyindic = json.load(f)

    def construct_Trie(self):
        self.trie = Trie()
        for key in self.pinyindic.keys():
            self.trie.add(key)

    def construct_DAG(self, seq):
        # {start index: [possible word-end indices]}
        self.DAG = {}
        for i in range(1, len(seq) - 1):
            self.DAG[i] = self.trie.scan(seq[i:-1], i)
        # BOS and EOS map to themselves
        self.DAG[len(seq) - 1] = [len(seq) - 1]
        self.DAG[0] = [0]

    def dp_search(self, seq):
        seq = '^' + seq + '$'
        self.construct_DAG(seq)
        # viterbi = {i: {end1: (prob, next), end2: (prob, next)}}
        viterbi = {}
        for i in range(len(seq)):
            viterbi[i] = {}
        viterbi[len(seq) - 1][len(seq) - 1] = (0., len(seq))
        # backward dynamic programming
        for i in range(len(seq) - 2, -1, -1):
            # take the maximum probability over every word starting at i:
            # P(w[x+1..y] | w[i..x]) combined with viterbi[x+1][y][0]
            for x in self.DAG[i]:
                viterbi[i][x] = max(
                    (self.bidic.get(seq[x + 1:y + 1], {}).get(seq[i:x + 1], self.minfreq)
                     + viterbi[x + 1][y][0], y)
                    for y in self.DAG[x + 1])
        # the transition out of BOS selects the end of the first word
        end = max(
            (self.bidic.get(seq[1:y + 1], {}).get(seq[0], self.minfreq)
             + viterbi[1][y][0], y)
            for y in self.DAG[1])[1]
        # backtrack through the stored next-pointers
        start = 1
        segs = []
        while start < len(seq) - 1:
            segs.append(seq[start:end + 1])
            temp = start
            start = end + 1
            end = viterbi[temp][end][1]
        return segs
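# construct_DAG above relies on a Trie.scan(text, offset) that returns the
# absolute end indices of every dictionary word starting at that offset.
# A minimal sketch extending the dict-based Trie shown after the first test;
# the method name comes from the caller, while the body and the
# single-character fallback are assumptions.
class ScanTrie(Trie):
    def scan(self, text, offset):
        ends, node = [], self.trie
        for k, ch in enumerate(text):
            if ch not in node:
                break
            node = node[ch]
            if '$' in node:  # a complete dictionary word ends here
                ends.append(offset + k)
        return ends or [offset]  # fall back to the single character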
def trie_dictionary(dictionary):
    trie = Trie()
    for key, value in dictionary.items():
        trie.add(key, value)
    return trie
def testExample(self):
    trie = Trie()
    trie.add("hack")
    trie.add("hackerrank")
    self.assertEqual(trie.find("hac"), 2)
    self.assertEqual(trie.find("hak"), 0)
def build_trie(words):
    """Return a trie built from a set of 235,886 words."""
    print('building trie...')
    root = Trie()
    for i, word in enumerate(words):
        root.add(word, i)
    print('done\n')
    return root
def func():
    # wait until tags and aliases have finished loading
    while not tags or not all_alias:
        time.sleep(0.1)
    cmd_trie, tag_trie = Trie(), Trie()
    cmd_trie.add(cfg.cmds)
    cmd_trie.add(all_alias['cmd'].keys())
    tag_trie.add(tags.keys())
    global tries
    tries = {'cmd': cmd_trie, 'tag': tag_trie}
def gen_suffix_trie(fname):
    from trie import Trie, DATrie
    trie = Trie()
    pytrie = DATrie()
    for s in valid_syllables:
        trie.add(s[::-1], valid_syllables[s])  # reversed keys build a suffix trie
    pytrie.construct_from_trie(trie)
    pytrie.output_static_c_arrays(fname)
def benchmark_regex_trie(LINE):
    from trie import Trie
    import re
    trie = Trie()
    for key in KEYS:
        trie.add(key)
    regex = re.compile(trie.pattern())
    print(regex.findall(LINE))
    benchmark("regex.findall(LINE)", locals())
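# trie.pattern() above turns the stored keys into a single alternation
# regex, the trick behind trie-backed matchers. A sketch of how such a
# pattern can be derived from a nested-dict trie with '$' terminators;
# the method name comes from the caller, the recursion is an assumption.
import re


def trie_pattern(node):
    alts, optional = [], False
    for ch, child in sorted(node.items()):
        if ch == '$':
            optional = True  # a key ends at this node
        else:
            alts.append(re.escape(ch) + trie_pattern(child))
    if not alts:
        return ''
    body = alts[0] if len(alts) == 1 else '(?:' + '|'.join(alts) + ')'
    return '(?:' + body + ')?' if optional else body

# e.g. for the keys {'foo', 'foot', 'bar'} this yields '(?:bar|foo(?:t)?)'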
class ControlIt:
    def __init__(self):
        self.the_trie = Trie()

    def file_grab(self):
        with open("content/words1.txt", 'r') as log:
            loglist = log.readlines()
        cl = []
        sendc = []
        pattern = r'^\w+-\s'
        for index, line in enumerate(loglist):
            match = re.search(pattern, line)
            if match is not None:
                first_found = match.string.split(' ', 1)[0]
                second_word = loglist[index + 1].split(' ', 1)[0]
                if second_word != "ship":
                    sendc.append(second_word.lower())
                cl.append((first_found + second_word).lower())
        strip_chars = ';:*.,[]|"�<>()!°&¶/'
        try:
            with open("content/words1.txt", encoding='UTF-8') as f:
                lines = f.read().translate(
                    {ord(i): None for i in strip_chars}).lower().split()
        except UnicodeDecodeError:
            with open("content/words1.txt", encoding='latin_1') as f:
                lines = f.read().translate(
                    {ord(i): None for i in strip_chars}).lower().split()
        for index, i in enumerate(lines):
            # drop heavily hyphenated tokens and hyphen-linebreak fragments
            if i.count('-') >= 3 or re.search(r'\w+-\n', i) is not None:
                lines[index] = None
        lines.extend(cl)
        for i in sendc:
            lines = [x for x in lines if re.sub(i, "", str(x))]
        return lines

    def grab_file(self):
        start_list = self.file_grab()
        for i in start_list:
            self.the_trie.add(self.the_trie.trie, str(i))
        mytrie = self.the_trie.get_trie()
        self.the_trie.trie_pickle(mytrie)
        print(mytrie)

    def main(self):
        self.grab_file()
        self.the_trie.create_binary_tree()
        self.the_trie.create_avl_tree()
def index_emails(path_to_directory):
    trie = Trie()
    with multiprocessing.Pool() as p:
        paths = glob.glob(os.path.join(path_to_directory, '**/*.'),
                          recursive=True)
        for pairs in tqdm(p.imap(index_path, paths), total=len(paths)):
            for (word, path) in pairs:
                trie.add(word, path)
    return trie
def mkEncTrie(dct):
    """
    Makes an "encoding" trie from a description dict.
    Only call from mkTries to ensure that the reverse trie gets built as well.
    """
    rv = Trie()
    for key, value in dct.items():
        try:
            # values without a NoReverse flag are stored whole
            if value[1] is not NoReverse:
                raise Exception
        except Exception:
            rv.add(key, value)
        else:
            rv.add(key, value[0])
    return rv
class TestTrie(unittest.TestCase):
    """Python tests meant to run on a saved copy of Slack JSON data."""

    def setUp(self):
        self.trie = Trie()
        # default Slack JSON file name
        with open('users.json') as users:
            loaded = json.load(users)
        for member in loaded['members']:
            if 'real_name' not in member:
                continue
            self.trie.add_name(0, member['real_name'].lower(), member)

    def test_addition(self):
        """Make sure trie entries are added correctly."""
        added = self.trie.add('Jane Doe'.lower(), {'value': 0})
        self.assertTrue(added)
        self.assertIsNotNone(self.trie.search('Jane Doe'.lower()))

    def test_search(self):
        """Verify trie entries."""
        self.assertIsNone(self.trie.search('John Doe'.lower()))
        self.assertIsNotNone(self.trie.search('Aaron Long'.lower()))
def __create_documents(self, directory):
    '''
    Receives a directory, instantiates a Documents() object for each
    file in it, and adds their words to the trie.
    '''
    archives = os.listdir(directory)
    documents = Lista()
    temp = Trie()
    for archive_name in archives:
        doc = Documents(directory + archive_name)
        documents.anexar(doc)
        for ngram in doc.ngrams:
            string = ' '.join(ngram.sequence) + ' '
            temp.add(string, doc)
    return documents, temp
class TrieTest(unittest.TestCase):
    def setUp(self):
        self.trie = Trie()
        self.trie.add('CAT')
        self.trie.add('CAR')
        self.trie.add('CART')
        self.trie.add('DO')
        self.trie.add('DOG')
        self.trie.add('X')

    def test_root_children(self):
        root = self.trie.root
        self.assertEqual(3, len(root.children))
        self.assertTrue(root.children['C'])
        self.assertTrue(root.children['D'])
        self.assertTrue(root.children['X'])

    def test_single_letter(self):
        x_node = self.trie.root.children['X']
        self.assertEqual(0, len(x_node.children))
        self.assertTrue(x_node.is_word)

    def test_prefix_is_also_word(self):
        d_node = self.trie.root.children['D']
        self.assertFalse(d_node.is_word)
        do_node = d_node.children['O']
        self.assertTrue(do_node.is_word)
        dog_node = do_node.children['G']
        self.assertTrue(dog_node.is_word)

    def test_multiple_branches_are_words(self):
        ca_node = self.trie.root.children['C'].children['A']
        self.assertFalse(ca_node.is_word)
        cat_node = ca_node.children['T']
        self.assertTrue(cat_node.is_word)
        car_node = ca_node.children['R']
        self.assertTrue(car_node.is_word)
        cart_node = car_node.children['T']
        self.assertTrue(cart_node.is_word)
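# A minimal node-based Trie consistent with the tests above: a root node,
# a per-node `children` dict, and an `is_word` flag. The attribute names
# come from the tests; the implementation is an assumption.
class TrieNode:
    def __init__(self):
        self.children = {}
        self.is_word = False


class NodeTrie:
    def __init__(self):
        self.root = TrieNode()

    def add(self, word):
        node = self.root
        for ch in word:
            node = node.children.setdefault(ch, TrieNode())
        node.is_word = True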
class Router:
    def __init__(self):
        self.trie = Trie()

    def add_route(self, route, route_action):
        segments = ['/' + seg for seg in route.split('/')]
        self.trie.add(segments, route_action)

    def show_routes(self):
        root = self.trie.root
        th = []
        first_child = list(root.children.values())[0]
        for thing in self.trie.get_paths()(first_child):
            th.append(thing)
        return th

    def load_route(self, route):
        segments = ['/' + seg for seg in route.split('/')]
        action, bindings = self.trie.find(segments, {})
        return functools.partial(action, **bindings)
class Autocomplete(object):
    """Implement autocomplete on top of the CharNode-based Trie."""

    def __init__(self):
        self.root = Trie()
        with open("/usr/share/dict/words", "r") as words_file:
            self.dictionary = [word.replace('\n', '') for word in words_file]
        # self.dictionary = ['app', 'apple', 'application', 'apply', 'apricot']
        for word in self.dictionary:
            self.root.add(word)

    def autocompleteTest(self, word):
        curr = self.root.root
        for letter in word:
            curr = curr.children.get(letter)
            if curr is None:
                return  # no words with the given prefix
        yield from curr.all_words_from_current_node(word)
def mkDecTrie(dct):
    """
    Makes a "decoding", or "reverse" trie from a description dict.
    Only call from mkTries to ensure that the encoding trie gets built as well.
    """
    rv = Trie()
    for key, value in dct.items():
        try:
            # entries flagged NoReverse are skipped entirely
            if value[1] is not NoReverse:
                raise Exception
        except Exception:
            try:
                # nested tries get reversed recursively
                if not callable(value[1].find_prefix):
                    raise Exception
            except Exception:
                rv.add(value, key)
            else:
                new_key = value[0]
                new_val = (key, revTrie(value[1])) + value[2:]
                rv.add(new_key, new_val)
    return rv
def testAddSubstring(self):
    trie = Trie()
    trie.add("aberer")
    trie.add("ab")
    trie.add("a")
    self.assertEqual(trie.find("a"), 3)
    self.assertEqual(trie.find("ab"), 2)
    self.assertEqual(trie.find("aberer"), 1)
    trie.add("abe")
    self.assertEqual(trie.find("abe"), 2)
def read_volunteers():
    """Read all the volunteers in and orchestrate their transformation."""
    group = None         # will hold Volunteer objects
    user_trie = Trie()   # will hold the complete Slack user list from JSON
    with open("volunteers.csv") as volunteers:
        reader = csv.reader(volunteers)
        group = [Volunteer(line) for line in reader]
        group.pop(0)  # drop the CSV header row
    with open("config.yaml") as config:
        reader = yaml.safe_load(config)
        user_list = get_users_slack(reader["slack"])
    for user in user_list:
        if 'real_name' not in user:
            continue
        user_trie.add(user['real_name'].lower(), user)
    with open('./volunteers.md', 'w') as md_file:
        for gr in group:
            gr.parse_slack(user_trie)
            md_file.write(str(gr))
class TestTrie(TestCase):
    def setUp(self):
        self.trie = Trie()
        self.trie.add("David")
        self.trie.add("David")
        self.trie.add("Dave")
        self.trie.add("Davidson")

    def test_recall(self):
        self.assertEqual(self.trie.count("David"), 2)
        self.assertEqual(self.trie.overlay_count("David"), 3)
        with self.assertRaises(KeyError):
            self.trie.count("Ben")

    def test_exists(self):
        self.assertFalse(self.trie.exist("Davison"))
        self.assertTrue(self.trie.exist("David"))
        self.assertFalse(self.trie.exist("Dav"))

    def test_initializer(self):
        trie = Trie(["David", "David", "Dave", "Davidson"])
        self.assertEqual(trie.count("David"), 2)
        self.assertEqual(trie.overlay_count("David"), 3)

    def test_random(self):
        name, count = self.trie.random()
        self.assertEqual(type(name), str)
        self.assertEqual(type(count), int)
def test_allwords(self):
    t = Trie()
    t.add('hello')
    t.add('help')
    t.add('hell')
    t.add('hall')
    self.assertSameSet(['hello', 'help', 'hell', 'hall'], t.allwords())
def test_lookup_prefix(self):
    t = Trie()
    t.add('hello')
    t.add('help')
    t.add('hell')
    t.add('hall')
    t.add('harp')
    self.assertSameSet(t.lookup('hel'), ['hello', 'help', 'hell'])
class Trial(IApplication):
    def __init__(self, frame):
        self.frame = frame
        self.trie = Trie()

    def initialize(self):
        pass

    def update(self):
        flag = False
        bls = self.frame.get(BadLink)
        print(len(bls))
        for l in bls:
            self.trie.add(l.url, None)
        for t in sorted(Trie.flat_list, key=lambda x: x.count, reverse=True):
            print(t.previous, t.count)
            flag = True
        if flag:
            self.done = True

    def shutdown(self):
        print("Done")
def test_features(self):
    tr = Trie('the')
    tr.add('bar')
    tr.add('batt')
    tr.add('at')
    self.assertEqual(tr.count(), 4)
    self.assertEqual(tr.has('att'), False)
    self.assertEqual(tr.has('bar'), True)
    self.assertEqual(tr.has('value'), False)
def create_data_model(json_data):
    '''Create serializable data from JSON data, or load it if available.'''
    if not os.path.exists(pickle_filename):
        with open(json_data) as json_file:
            sample_conversations_data = json.load(json_file)
        with open(pickle_filename, "wb") as output_data:  # pickle needs binary mode
            pickle.dump(sample_conversations_data, output_data)
    # load the pickle file to get the JSON data
    with open(pickle_filename, "rb") as input_data:
        sample_conversations_data = pickle.load(input_data)
    # collect all the sentences in the text data
    text_data = []
    for index, issue in enumerate(sample_conversations_data['Issues'][:]):
        for message in issue['Messages']:
            clean_text = string_preprocess(message['Text'])
            text_data.append(clean_text)
    trie = Trie()  # trie object to store the data
    for sentence in text_data:
        trie.add(sentence)
    # create the data model if it does not exist
    if not os.path.exists(pickle_data_model):
        with open(pickle_data_model, "wb") as data_model:
            pickle.dump(trie, data_model)
        print("data model created")
def tab_cmd(target, t):
    '''Command-line tab completion.'''
    res, perfix = None, target
    if t == 'cmd':
        res = tries['cmd'].getStartBy(target)
    elif t == 'tag':
        res = tries['tag'].getStartBy(target)
    else:
        # complete a filesystem path
        basename = os.path.basename(target)
        perfix = addEscape(basename)
        target = parse_path(target)
        dirname = os.path.dirname(target)
        if os.path.isdir(dirname):
            paths = os.listdir(dirname)
            if t == 'dir':
                paths = [path for path in paths
                         if os.path.isdir(os.path.join(dirname, path))]
            elif t == 'file':
                paths = [path for path in paths
                         if os.path.isfile(os.path.join(dirname, path))]
            # match the prefix, ignoring case
            paths = filter(
                lambda path: re.match(f'^{addEscape(basename)}', path, re.I),
                paths)
            path_trie = Trie()
            path_trie.add(paths)
            res = path_trie.getStartBy(basename)
        else:
            res = []
    return {'code': 0, 'res': res, 'perfix': perfix}
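# tab_cmd and func above assume a Trie whose add() takes an iterable of
# words and whose getStartBy(prefix) returns every stored word with that
# prefix. A minimal sketch under those assumptions; only the two method
# names come from the callers.
class CompletionTrie:
    def __init__(self):
        self.trie = {}

    def add(self, words):
        for word in words:
            node = self.trie
            for ch in word:
                node = node.setdefault(ch, {})
            node['$'] = None  # end-of-word marker

    def getStartBy(self, prefix):
        node = self.trie
        for ch in prefix:
            if ch not in node:
                return []
            node = node[ch]
        # depth-first walk collecting every completion below the prefix
        res, stack = [], [(node, prefix)]
        while stack:
            cur, acc = stack.pop()
            for key, child in cur.items():
                if key == '$':
                    res.append(acc)
                else:
                    stack.append((child, acc + key))
        return res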
def testAddDeNovo(self):
    trie = Trie()
    trie.add("abl")
    trie.add("ac")
    trie.add("bca")
    trie.add("baa")
    self.assertEqual(trie.find("b"), 2)
    self.assertEqual(trie.find("a"), 2)
    self.assertEqual(trie.find("abl"), 1)
    self.assertEqual(trie.find("ac"), 1)
    self.assertEqual(trie.find("bca"), 1)
    self.assertEqual(trie.find("baa"), 1)
from trie import Trie
from data import *
from welcome import *
from hashmap import HashMap
from linkedlist import LinkedList

# print the welcome message
print_welcome()

# enter cuisine data
food_types = Trie()
eateries = HashMap(len(types))
for food in types:
    food_types.add(food)
    eateries.assign(food, LinkedList())

# restaurant data-point key names:
eatery_cuisine = "cuisine"
eatery_name = "name"
eatery_price = "price"
eatery_rating = "rating"
eatery_address = "address"

# enter restaurant data
for restaurant in restaurant_data:
    current_eateries_for_cuisine = eateries.retrieve(restaurant[0])
    current_eatery_data = HashMap(len(restaurant))
    current_eatery_data.assign(eatery_cuisine, restaurant[0])
    current_eatery_data.assign(eatery_name, restaurant[1])
    current_eatery_data.assign(eatery_price, restaurant[2])
    current_eatery_data.assign(eatery_rating, restaurant[3])
    current_eatery_data.assign(eatery_address, restaurant[4])
class TestWords(unittest.TestCase):
    """Tests for the words() generator."""

    def setUp(self):
        unittest.TestCase.setUp(self)
        self.mytrie = Trie()
        self.mytrie.add('ant', 1)
        self.mytrie.add('ante', 2)
        self.mytrie.add('antic', 3)
        self.mytrie.add('antsy', 4)
        self.mytrie.add('antse', 5)
        self.mytrie.add('ban', 6)
        self.mytrie.add('banana', 7)

    def test_default_case(self):
        """Test that words() retrieves all words from the Trie."""
        expected = ['ante', 'antic', 'ant', 'antsy', 'antse', 'banana', 'ban']
        actual = []
        for word in self.mytrie.words():
            actual.append(word)
        self.assertEqual(sorted(actual), sorted(expected))
def make_trie(items):
    trie = Trie()
    for key, item in items:
        if _debug:
            print(": ".join([key, item]))
        trie.add(*(unicodedata.normalize('NFC', x) for x in (key, item)))
    return trie
# Given a book of words, and assuming enough main memory to hold all of
# them, design a data structure that finds the top K most frequently
# occurring words. The structure should be dynamic so new words can be added.
from trie import Trie


def find_k_most_frequent(word: str):
    most_frequents = []
    return most_frequents


if __name__ == '__main__':
    teststr = "the a there anaswe any by their"
    # find_k_most_frequent(teststr)
    t = Trie()
    for s in teststr.split(' '):
        print("Adding({})".format(s))
        t.add(s)
    # search for different keys
    print("{} ---- {}".format("the", t.search("the")))
    print("{} ---- {}".format("these", t.search("these")))
    print("{} ---- {}".format("their", t.search("their")))
    print("{} ---- {}".format("thaw", t.search("thaw")))
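# find_k_most_frequent above is still a stub. One common answer to the
# stated problem is to keep per-word counts (a trie with a counter at each
# terminal node works; a plain dict is used here for brevity) and pull the
# K largest with a heap. A sketch under those assumptions, with a
# hypothetical signature taking the text and K:
import heapq
from collections import Counter


def find_k_most_frequent_sketch(text: str, k: int):
    counts = Counter(text.split())
    # heapq.nlargest runs in O(n log k); ties are broken arbitrarily
    return heapq.nlargest(k, counts.items(), key=lambda kv: kv[1])

# e.g. find_k_most_frequent_sketch("the a there the a the", 2)
# -> [('the', 3), ('a', 2)]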
def create_dictionary():
    with open(DICTIONARY_FILE) as f:
        global dictionary
        dictionary = Trie()
        for word in f:
            dictionary.add(word.strip())