def setUp(self):
    """Build the shared fixture: a SortedStringTrie mapping each
    sample word to its position in the word list."""
    self.words = 'an ant all allot alloy aloe are ate be'.split()
    self.trie = SortedStringTrie(
        (word, position) for position, word in enumerate(self.words))
def split_file(
        in_file,
        out_file,
        fields="bgg_user_name",
        trie_file=None,
        limits=LIMIT,
        construct=False,
):
    """Split *in_file* into per-prefix outputs.

    A prefix trie is either loaded from *trie_file* (unless *construct*
    forces a rebuild) or constructed from the input file itself, once per
    value in *limits*. Prefix counts are written to *trie_file* (formatted
    with the limit), to stdout when the path is missing or "-". Finally the
    input is split into prefix buckets via ``_save_to_prefixes``.
    """
    trie = None
    # Reuse a previously saved trie unless the caller forces reconstruction.
    if trie_file and not construct:
        LOGGER.info("loading trie from file <%s>...", trie_file)
        trie = _trie_from_file(trie_file)
    if not trie:
        LOGGER.info("making trie for <%s>...", in_file)
        full_trie = _make_trie(file=in_file, fields=fields)
        # Normalise `limits` to a non-empty tuple, defaulting to LIMIT.
        limits = tuple(arg_to_iter(limits)) or (LIMIT, )
        for limit in limits:
            trie = Trie(_prefixes(full_trie, limit=limit))
            LOGGER.info("%d prefixes using limit %d", len(trie), limit)
            out_path = trie_file.format(limit=limit) if trie_file else None
            if not out_path or out_path == "-":
                # No usable target path: dump prefix counts to stdout.
                for prefix, count in trie.items():
                    print(f"{prefix}\t{count}")
            else:
                with open(out_path, "w") as file_obj:
                    for prefix, count in trie.items():
                        file_obj.write(f"{prefix}\t{count}\n")
    # NOTE(review): when several limits are given, only the trie built for
    # the *last* limit is used for the actual split below — confirm intended.
    LOGGER.info("constructed trie of size %d", len(trie))
    _save_to_prefixes(dst=out_file, trie=trie, file=in_file, fields=fields)
def __init__(self, *args, **kwargs):
    """Populate both backing stores: the prefix trie and the
    case-folded dict keyed by ``case_insensitive(key)``."""
    self._dict = {}
    self._trie = Trie(*args, **kwargs)
    source = dict(*args, **kwargs)
    self._dict = {case_insensitive(key): value
                  for key, value in source.items()}
def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
    """Initialise the case-insensitive mapping.

    Accepts the same arguments as ``dict``; stores the raw entries in a
    prefix trie and a case-folded copy in ``_dict``.
    """
    self._dict = {}
    self._trie = Trie(*args, **kwargs)
    d = dict(*args, **kwargs)
    # BUG FIX: dict.iteritems() only exists on Python 2 and raises
    # AttributeError on Python 3; items() behaves the same on both.
    for key, value in d.items():
        self._dict[case_insensitive(key)] = value
def context_entities(setname, domain_set):
    """For each domain, greedily mark entity-name matches in every mention
    context and write the marked texts to ``{domain}_matched.json``.

    Matches are found via longest-prefix lookup in a trie of entity names;
    only matches aligned on space boundaries are counted and wrapped in
    ``##...##`` markers.
    """
    for domain in domain_set:
        with open(
                JSON_OUTPUT_DIR + '/mentions/{}/{}.json'.format(setname, domain),
                'r') as f:
            # Add this domain's entity names to a fresh trie.
            trie = Trie()
            for ent_id in domain2entities[domain]:
                # Ignore the multi-category case: use the canonical name only.
                trie.setdefault(entities2name[ent_id].lower(), 0)
                # Multi-category variant (disabled):
                # trie.setdefault(entities2alias[ent_id], 0)
            total_alias_count = 0
            total_doc_count = 0
            matched_text_list = []
            for line in tqdm(f, desc='processing {}...'.format(domain)):
                datum = json.loads(line.strip())
                text = decode(datum['mention_context_tokens'])
                total_doc_count += 1
                i = 0
                matched_text = ''
                # Greedy scan: at each position take the longest entity-name
                # prefix; advance one character when nothing matches.
                while i < len(text):
                    item = trie.longest_prefix_item(text[i:], default=None)
                    if item is not None:
                        prefix, key_id = item
                        # Only count matches bounded by spaces (or string ends)
                        # on both sides, i.e. whole-word matches.
                        if (i == 0 or text[i-1] == ' ') \
                                and (i+len(prefix) == len(text) or text[i+len(prefix)] == ' '):
                            total_alias_count += 1
                            i += len(prefix)
                            matched_text += '##' + prefix + '##'
                        else:
                            # Partial-word match: copy unmarked but still skip it.
                            matched_text += prefix
                            i += len(prefix)
                    else:
                        matched_text += text[i]
                        i += 1
                matched_text_list.append(matched_text)
            print('Avg alias count {:.2f} in {} documents.'.format(
                total_alias_count / total_doc_count, domain))
        # Persist the marked-up texts for this domain, one per blank-separated block.
        with open(
                JSON_OUTPUT_DIR + '/mentions/{}/{}_matched.json'.format(setname, domain),
                'w') as f:
            for text in matched_text_list:
                f.write(text + '\n\n')
def create_syllable_guide(self):
    """Build two tries keyed by word from ``self.dictionary``.

    Returns:
        tuple: ``(syllable_count, syllable_pronounciation)`` where the
        first maps word -> syllable count and the second maps
        word -> pronunciation string.
    """
    syllable_count = SortedStringTrie()
    syllable_pronounciation = SortedStringTrie()
    for line in self.dictionary:
        index_space = line.find(" ")
        word = line[:index_space]
        # assumes word and pronunciation are separated by two spaces —
        # TODO confirm against the dictionary format.
        pronounciation = line[index_space + 2:]
        syllables = self.count_syllables(pronounciation)
        # FIX: use plain item assignment instead of calling __setitem__
        # directly — same behavior, idiomatic mapping syntax.
        syllable_count[word] = syllables
        syllable_pronounciation[word] = pronounciation
    return syllable_count, syllable_pronounciation
def createTrie(self, filePath):
    """Build a Trie mapping each word to the set of IDs it appears under.

    Each input line is expected to be ``ID<TAB>word1;word2;...``; lines
    without a tab-separated second field are skipped.

    Returns:
        Trie: word -> set of IDs.
    """
    trie = Trie()
    with open(filePath, 'r') as file:
        # Iterate the file lazily instead of materialising readlines();
        # the dead `flag` counter from the original is removed.
        for line in file:
            parts = line.strip().split('\t')
            if len(parts) > 1:
                for word in parts[1].split(';'):
                    if word != '':
                        # Create the set on first sight, then add the ID —
                        # collapses the duplicated if/else branches.
                        if trie.get(word) is None:
                            trie[word] = set()
                        trie[word].add(parts[0])
    return trie
def init_banned_tree():
    """Lazily build and cache (in module-global BANNED_T) the banned-word trie.

    Reads ``banned_text``, decodes it as UTF-8, splits on ``SPLIT`` and maps
    every banned word to 1. Subsequent calls return the cached trie.
    """
    global BANNED_T
    if BANNED_T:
        return BANNED_T
    with open(banned_text, 'rb') as fp:
        data = fp.read()
    # BUG FIX: `unicode` only exists on Python 2. Checking for `bytes` is
    # equivalent there (str is bytes) and also works on Python 3, where the
    # original raised NameError.
    if isinstance(data, bytes):
        data = data.decode('utf-8')
    words = data.split(SPLIT)
    BANNED_T = trie.fromkeys(words, 1)
    return BANNED_T
def create_syllable_guide(self):
    """Construct the syllable lookup tables from ``self.dictionary``.

    Returns:
        tuple: ``(syllable_count, syllable_pronounciation)`` tries keyed
        by word, holding the syllable count and pronunciation string.
    """
    syllable_count = SortedStringTrie()
    syllable_pronounciation = SortedStringTrie()
    for line in self.dictionary:
        index_space = line.find(" ")
        word = line[:index_space]
        # assumes a two-space separator between word and pronunciation —
        # TODO confirm against the dictionary format.
        pronounciation = line[index_space + 2:]
        syllables = self.count_syllables(pronounciation)
        # FIX: plain item assignment instead of explicit __setitem__ calls.
        syllable_count[word] = syllables
        syllable_pronounciation[word] = pronounciation
    return syllable_count, syllable_pronounciation
class CaseInsensitiveDict(MutableMapping):
    """A dictionary with case-insensitive lookups.

    It preserves the case of the keys. It also allows looking up the
    keys by prefix via :meth:`prefixed_keys`. Internally ``_dict`` holds
    the case-folded view used for item access while ``_trie`` holds the
    lower-cased keys for prefix queries.
    """

    def __init__(self, *args, **kwargs):  # pylint: disable=super-init-not-called
        self._dict = {}
        self._trie = Trie(*args, **kwargs)
        d = dict(*args, **kwargs)
        # BUG FIX: dict.iteritems() is Python-2-only; items() works on
        # both Python 2 and 3 with identical semantics here.
        for key, value in d.items():
            self._dict[case_insensitive(key)] = value

    def __contains__(self, key):
        return case_insensitive(key) in self._dict

    def __delitem__(self, key):
        cl = case_insensitive(key)
        # The trie is keyed by key.lower() (see __setitem__); keep in sync.
        del self._trie[key.lower()]
        del self._dict[cl]

    def __iter__(self):
        return iter(self._dict)

    def __len__(self):
        return len(self._dict)

    def __getitem__(self, key):
        return self._dict[case_insensitive(key)]

    def __setitem__(self, key, value):
        cl = case_insensitive(key)
        self._trie[key.lower()] = value
        self._dict[cl] = value

    def has_key(self, key):
        # Kept for backward compatibility with Python-2-style callers.
        return self.__contains__(key.lower())

    def prefixed_keys(self, prefix):
        """Return all stored keys starting with *prefix*, case-insensitively."""
        return self._trie.keys(prefix.lower())
def setUp(self):
    """Create the fixture trie: each sample word mapped to its index."""
    self.words = 'an ant all allot alloy aloe are ate be'.split()
    self.trie = SortedStringTrie(
        (word, index) for index, word in enumerate(self.words))
class TestTrie(unittest.TestCase):
    """Exercise prefix queries of a SortedStringTrie built from sample words."""

    def setUp(self):
        self.words = 'an ant all allot alloy aloe are ate be'.split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        self.assertEqual(self.trie.longest_prefix('antonym'), 'ant')
        self.assertEqual(self.trie.longest_prefix('are'), 'are')
        self.assertEqual(self.trie.longest_prefix('alla'), 'all')
        self.assertEqual(self.trie.longest_prefix('allo'), 'all')
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(self.trie.longest_prefix('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix('linux', default=-1), -1)

    def test_longest_prefix_value(self):
        self.assertEqual(self.trie.longest_prefix_value('antonym'), 1)
        self.assertEqual(self.trie.longest_prefix_value('are'), 6)
        self.assertEqual(self.trie.longest_prefix_value('alla'), 2)
        self.assertEqual(self.trie.longest_prefix_value('allo'), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, 'alumni')
        self.assertEqual(
            self.trie.longest_prefix_value('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix_value('linux', default=-1), -1)

    def test_longest_prefix_item(self):
        self.assertEqual(self.trie.longest_prefix_item('antonym'), ('ant', 1))
        self.assertEqual(self.trie.longest_prefix_item('are'), ('are', 6))
        self.assertEqual(self.trie.longest_prefix_item('alla'), ('all', 2))
        self.assertEqual(self.trie.longest_prefix_item('allo'), ('all', 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(self.trie.longest_prefix_item('alumni', default=None),
                         None)
        self.assertEqual(self.trie.longest_prefix_item('linux', default=-1), -1)

    def test_iter_prefixes(self):
        self.assertEqual(list(self.trie.iter_prefixes('antonym')),
                         ['an', 'ant'])
        self.assertEqual(list(self.trie.iter_prefixes('are')), ['are'])
        self.assertEqual(list(self.trie.iter_prefixes('alumni')), [])

    def test_iter_prefix_values(self):
        self.assertEqual(list(self.trie.iter_prefix_values('antonym')), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values('are')), [6])
        self.assertEqual(list(self.trie.iter_prefix_values('alumni')), [])

    def test_iter_prefix_items(self):
        self.assertEqual(list(self.trie.iter_prefix_items('antonym')),
                         [('an', 0), ('ant', 1)])
        self.assertEqual(list(self.trie.iter_prefix_items('are')), [('are', 6)])
        self.assertEqual(list(self.trie.iter_prefix_items('alumni')), [])

    def test_keys_wprefix(self):
        self.assertEqual(self.trie.keys('al'), ['all', 'allot', 'alloy', 'aloe'])
        self.assertEqual(self.trie.keys('are'), ['are'])
        self.assertEqual(self.trie.keys('ann'), [])

    def test_values_wprefix(self):
        self.assertEqual(self.trie.values('al'), [2, 3, 4, 5])
        self.assertEqual(self.trie.values('are'), [6])
        self.assertEqual(self.trie.values('ann'), [])

    def test_items_wprefix(self):
        self.assertEqual(self.trie.items('al'),
                         [('all', 2), ('allot', 3), ('alloy', 4), ('aloe', 5)])
        self.assertEqual(self.trie.items('are'), [('are', 6)])
        self.assertEqual(self.trie.items('ann'), [])

    def test_consistency_wprefix(self):
        t = self.trie
        for prefix in 'al', 'are', 'ann':
            # BUG FIX: on Python 3 zip() returns an iterator that never
            # compares equal to a list; materialise it first.
            self.assertEqual(t.items(prefix),
                             list(zip(t.keys(prefix), t.values(prefix))))

    def test_pickle(self):
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # BUG FIX: xrange is Python-2-only; range(HIGHEST_PROTOCOL + 1)
        # also covers the highest protocol, which the original off-by-one
        # excluded.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            # BUG FIX: assert_ was removed in Python 3; assertTrue is the
            # supported replacement.
            self.assertTrue(type(self.trie) is type(unpickled))
            self.assertTrue(self.trie is not unpickled)

    def test_repr(self):
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)
class TestTrie(unittest.TestCase):
    """Exercise prefix queries of a SortedStringTrie built from sample words."""

    def setUp(self):
        # Each word maps to its index: an=0, ant=1, all=2, allot=3, ...
        self.words = 'an ant all allot alloy aloe are ate be'.split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        self.assertEqual(self.trie.longest_prefix('antonym'), 'ant')
        self.assertEqual(self.trie.longest_prefix('are'), 'are')
        self.assertEqual(self.trie.longest_prefix('alla'), 'all')
        self.assertEqual(self.trie.longest_prefix('allo'), 'all')
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(self.trie.longest_prefix('alumni', default=None), None)
        self.assertEqual(self.trie.longest_prefix('linux', default=-1), -1)

    def test_longest_prefix_value(self):
        self.assertEqual(self.trie.longest_prefix_value('antonym'), 1)
        self.assertEqual(self.trie.longest_prefix_value('are'), 6)
        self.assertEqual(self.trie.longest_prefix_value('alla'), 2)
        self.assertEqual(self.trie.longest_prefix_value('allo'), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, 'alumni')
        self.assertEqual(self.trie.longest_prefix_value('alumni', default=None),
                         None)
        self.assertEqual(self.trie.longest_prefix_value('linux', default=-1), -1)

    def test_longest_prefix_item(self):
        self.assertEqual(self.trie.longest_prefix_item('antonym'), ('ant', 1))
        self.assertEqual(self.trie.longest_prefix_item('are'), ('are', 6))
        self.assertEqual(self.trie.longest_prefix_item('alla'), ('all', 2))
        self.assertEqual(self.trie.longest_prefix_item('allo'), ('all', 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(self.trie.longest_prefix_item('alumni', default=None),
                         None)
        self.assertEqual(self.trie.longest_prefix_item('linux', default=-1), -1)

    def test_iter_prefixes(self):
        self.assertEqual(list(self.trie.iter_prefixes('antonym')),
                         ['an', 'ant'])
        self.assertEqual(list(self.trie.iter_prefixes('are')), ['are'])
        self.assertEqual(list(self.trie.iter_prefixes('alumni')), [])

    def test_iter_prefix_values(self):
        self.assertEqual(list(self.trie.iter_prefix_values('antonym')), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values('are')), [6])
        self.assertEqual(list(self.trie.iter_prefix_values('alumni')), [])

    def test_iter_prefix_items(self):
        self.assertEqual(list(self.trie.iter_prefix_items('antonym')),
                         [('an', 0), ('ant', 1)])
        self.assertEqual(list(self.trie.iter_prefix_items('are')), [('are', 6)])
        self.assertEqual(list(self.trie.iter_prefix_items('alumni')), [])

    def test_keys_wprefix(self):
        self.assertEqual(self.trie.keys('al'), ['all','allot','alloy','aloe'])
        self.assertEqual(self.trie.keys('are'), ['are'])
        self.assertEqual(self.trie.keys('ann'), [])

    def test_values_wprefix(self):
        self.assertEqual(self.trie.values('al'), [2,3,4,5])
        self.assertEqual(self.trie.values('are'), [6])
        self.assertEqual(self.trie.values('ann'), [])

    def test_items_wprefix(self):
        self.assertEqual(self.trie.items('al'),
                         [('all',2),('allot',3),('alloy',4),('aloe',5)])
        self.assertEqual(self.trie.items('are'), [('are',6)])
        self.assertEqual(self.trie.items('ann'), [])

    def test_consistency_wprefix(self):
        # items(prefix) must agree with zipping keys(prefix) and values(prefix).
        t = self.trie
        for prefix in 'al','are','ann':
            self.assertEqual(
                t.items(prefix), list(zip(t.keys(prefix), t.values(prefix)))
            )

    def test_pickle(self):
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # NOTE(review): range(HIGHEST_PROTOCOL) excludes the highest protocol
        # itself — presumably unintended; confirm and consider + 1.
        for proto in range(HIGHEST_PROTOCOL):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            self.assertTrue(type(self.trie) is type(unpickled))
            self.assertTrue(self.trie is not unpickled)

    def test_repr(self):
        # repr() must round-trip through eval() back to an equal trie.
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)
class _Repository:
    """Borg-style singleton holding all declared CLI arguments.

    Copied from https://stackoverflow.com/questions/6255050/python-thinking-of-a-module-and-its-variables-as-a-singleton-clean-approach.

    Arguments live in the shared ``__dict__``; ``_arg_trie`` additionally
    indexes them by name prefix so abbreviated CLI flags resolve uniquely.
    """

    # Shared across all instances (Borg pattern).
    _shared_state = dict()
    # Prefix index: argument name (and "no_<name>" for booleans) -> Argument.
    _arg_trie = SortedStringTrie()
    # Registry name -> registry of config classes.
    _registries = dict()

    @classmethod
    def reset(cls):
        """Clear all repository state (used between test runs)."""
        cls._shared_state.clear()
        cls._arg_trie = SortedStringTrie()
        cls._registries.clear()

    def __init__(self):
        # Borg: every instance shares the same attribute dict.
        self.__dict__ = self._shared_state

    def add_argument(self,
                     name,
                     *aliases,
                     scope=None,
                     dtype=str,
                     default=None,
                     nargs=1,
                     msg='',
                     choices=None):
        """Declare a new argument; returns the created Argument object.

        Raises ArgumentScopeNotSupplied, DuplicateArgument or
        ReservedNameError on invalid declarations.
        """
        if scope is None:
            raise ArgumentScopeNotSupplied(
                'You have to explicitly set scope to a value.')
        arg = Argument(name,
                       *aliases,
                       scope=scope,
                       dtype=dtype,
                       default=default,
                       nargs=nargs,
                       msg=msg,
                       choices=choices)
        if arg.name in self.__dict__:
            raise DuplicateArgument(
                f'An argument named "{arg.name}" has been declared.')
        if arg.name in SUPPORTED_VIEW_ATTRS:
            raise ReservedNameError(
                f'Name "{arg.name}" is reserved for something else.')
        self.__dict__[arg.name] = arg
        self._arg_trie[arg.name] = arg  # NOTE(j_luo) This is class attribute, therefore not part of __dict__.
        # Boolean flags also get a "no_<name>" spelling for negation.
        if dtype == bool:
            self._arg_trie[f'no_{arg.name}'] = arg
        return arg

    def set_argument(self, name, value, *, _force=False):
        """Set an argument's value; requires an explicit _force=True."""
        if not _force:
            raise MustForceSetArgument(
                f'You must explicitliy set _force to True in order to set an argument.'
            )
        arg = self._get_argument_by_string(name)
        arg.value = value

    def add_registry(self, registry, scope):
        """Declare a registry-backed argument named after the registry."""
        try:
            arg = self.add_argument(registry.name, scope=scope, dtype=str)
        except DuplicateArgument:
            raise DuplicateRegistry(
                f'A registry named "{registry.name}" already exists.')
        self._registries[arg.name] = registry

    @property
    def configs(self) -> Dict[str, str]:
        """Current value of every registry-backed argument, keyed by name."""
        ret = dict()
        for name, registry in self._registries.items():
            arg = self._get_argument_by_string(name)
            ret[name] = arg.value
        return ret

    def get_view(self):
        # Read-only facade over the shared state.
        return _RepositoryView(self._shared_state)

    def _get_argument_by_string(self, name, source='CLI'):
        """Resolve *name* (possibly an abbreviation) to a unique Argument."""
        args = self._arg_trie.values(prefix=name)
        if len(args) > 1:
            found_names = [f'"{arg.name}"' for arg in args]
            raise MultipleMatches(
                f'Found more than one match for name "{name}": {", ".join(found_names)}.'
            )
        elif len(args) == 0:
            raise MatchNotFound(
                f'Found no argument named "{name}" from "{source}".')
        arg = args[0]
        return arg

    def parse_args(self, known_only=False):
        """Parse sys.argv into declared arguments and apply config registries.

        When *known_only* is True, unknown flags are silently skipped and
        help flags are ignored.
        """
        # Group argv segments: each group is one flag plus its values.
        arg_groups = list()
        group = list()
        for seg in sys.argv[1:]:
            if seg.startswith('-'):
                if group:
                    arg_groups.append(group)
                group = seg.split('=')
            else:
                group.append(seg)
        if group:
            arg_groups.append(group)
        # Parse the CLI string first.
        parsed = list()
        for group in arg_groups:
            name, *values = group
            name = name.strip('-')
            # NOTE(j_luo) Help mode. Note that if known_only is True, then help is ignored.
            if name == 'h' or name == 'help':
                if not known_only:
                    _print_all_args(log_also=False, and_exit=True)
                continue
            # NOTE(j_luo) Some other args might start with "h"!
            try:
                arg = self._get_argument_by_string(name, source='CLI')
                if arg.dtype == bool:
                    # "no_" prefix negates the boolean flag.
                    new_value = [not name.startswith('no_')] + values
                else:
                    new_value = values
                parsed.append((arg, new_value))
            except MatchNotFound as e:
                if not known_only:
                    raise e
        # Deal with config files and use their values to set the new default values.
        cfg_names = set()
        for arg, new_value in parsed:
            if arg.name in self._registries:
                arg.value = new_value
                reg = self._registries[arg.name]
                cfg_cls = reg[arg.value]
                cfg = vars(cfg_cls())
                for cfg_name, cfg_value in cfg.items():
                    cfg_arg = self._get_argument_by_string(
                        cfg_name, source=cfg_cls.__name__)
                    cfg_arg.value = cfg_value
                    if cfg_name in cfg_names:
                        raise OverlappingRegistries(
                            f'Argument named "{cfg_name}" has been found in multiple registries.'
                        )
                    cfg_names.add(cfg_name)
        # Set the remaning CLI arguments.
        for arg, new_value in parsed:
            if arg.name not in self._registries:
                arg.value = new_value
        # NOTE(review): `g` is not defined in this method or class —
        # presumably a module-level singleton view; verify it exists.
        return g
class TestTrie(unittest.TestCase):
    """Exercise prefix queries of a SortedStringTrie built from sample words."""

    def setUp(self):
        self.words = "an ant all allot alloy aloe are ate be".split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        self.assertEqual(self.trie.longest_prefix("antonym"), "ant")
        self.assertEqual(self.trie.longest_prefix("are"), "are")
        self.assertEqual(self.trie.longest_prefix("alla"), "all")
        self.assertEqual(self.trie.longest_prefix("allo"), "all")
        self.assertRaises(KeyError, self.trie.longest_prefix_item, "alumni")
        self.assertEqual(self.trie.longest_prefix("alumni", default=None), None)
        self.assertEqual(self.trie.longest_prefix("linux", default=-1), -1)

    def test_longest_prefix_value(self):
        self.assertEqual(self.trie.longest_prefix_value("antonym"), 1)
        self.assertEqual(self.trie.longest_prefix_value("are"), 6)
        self.assertEqual(self.trie.longest_prefix_value("alla"), 2)
        self.assertEqual(self.trie.longest_prefix_value("allo"), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, "alumni")
        self.assertEqual(self.trie.longest_prefix_value("alumni", default=None),
                         None)
        self.assertEqual(self.trie.longest_prefix_value("linux", default=-1), -1)

    def test_longest_prefix_item(self):
        self.assertEqual(self.trie.longest_prefix_item("antonym"), ("ant", 1))
        self.assertEqual(self.trie.longest_prefix_item("are"), ("are", 6))
        self.assertEqual(self.trie.longest_prefix_item("alla"), ("all", 2))
        self.assertEqual(self.trie.longest_prefix_item("allo"), ("all", 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, "alumni")
        self.assertEqual(self.trie.longest_prefix_item("alumni", default=None),
                         None)
        self.assertEqual(self.trie.longest_prefix_item("linux", default=-1), -1)

    def test_iter_prefixes(self):
        self.assertEqual(list(self.trie.iter_prefixes("antonym")),
                         ["an", "ant"])
        self.assertEqual(list(self.trie.iter_prefixes("are")), ["are"])
        self.assertEqual(list(self.trie.iter_prefixes("alumni")), [])

    def test_iter_prefix_values(self):
        self.assertEqual(list(self.trie.iter_prefix_values("antonym")), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values("are")), [6])
        self.assertEqual(list(self.trie.iter_prefix_values("alumni")), [])

    def test_iter_prefix_items(self):
        self.assertEqual(list(self.trie.iter_prefix_items("antonym")),
                         [("an", 0), ("ant", 1)])
        self.assertEqual(list(self.trie.iter_prefix_items("are")), [("are", 6)])
        self.assertEqual(list(self.trie.iter_prefix_items("alumni")), [])

    def test_keys_wprefix(self):
        self.assertEqual(self.trie.keys("al"), ["all", "allot", "alloy", "aloe"])
        self.assertEqual(self.trie.keys("are"), ["are"])
        self.assertEqual(self.trie.keys("ann"), [])

    def test_values_wprefix(self):
        self.assertEqual(self.trie.values("al"), [2, 3, 4, 5])
        self.assertEqual(self.trie.values("are"), [6])
        self.assertEqual(self.trie.values("ann"), [])

    def test_items_wprefix(self):
        self.assertEqual(self.trie.items("al"),
                         [("all", 2), ("allot", 3), ("alloy", 4), ("aloe", 5)])
        self.assertEqual(self.trie.items("are"), [("are", 6)])
        self.assertEqual(self.trie.items("ann"), [])

    def test_consistency_wprefix(self):
        t = self.trie
        for prefix in "al", "are", "ann":
            # BUG FIX: on Python 3 zip() returns an iterator that never
            # compares equal to a list; materialise it first.
            self.assertEqual(t.items(prefix),
                             list(zip(t.keys(prefix), t.values(prefix))))

    def test_pickle(self):
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # BUG FIX: xrange is Python-2-only; range(HIGHEST_PROTOCOL + 1)
        # also covers the highest protocol, which the original off-by-one
        # excluded.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            # BUG FIX: assert_ was removed in Python 3; assertTrue is the
            # supported replacement.
            self.assertTrue(type(self.trie) is type(unpickled))
            self.assertTrue(self.trie is not unpickled)

    def test_repr(self):
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)
class application(tkinter.Frame):
    """Tkinter front-end for a small search engine.

    Keeps search history in a SortedStringTrie (word -> frequency weight)
    and delegates querying/ranking to ``search_engine.crawler``.
    """

    # Widgets shared at class level so they can be swapped between pages.
    top_frame = None
    homepage_frame = None
    results_frame = None
    results_text = None
    # Backing search engine (name-mangled private class attribute).
    __s_engine = search_engine.crawler()
    # Resource/data file locations.
    bg_image = os.path.join('resources', 'background_1.jpg')
    search_history = os.path.join('data', 'search_history.txt')
    search_freq = os.path.join('data', 'search_freq.txt')
    # Search-history trie: query string -> accumulated frequency weight.
    history = SortedStringTrie()

    def __init__(self, master):
        # GUI setup: track the query variable so the suggestion listbox
        # refreshes on every keystroke.
        tkinter.Frame.__init__(self, master)
        self.master = master
        self.search_query = tkinter.StringVar()
        self.search_query.trace("w", self.update_listbox)
        self.initUI()
        # Load persisted search history.
        self.__restore_history()
        # Load crawler data and precompute ranking.
        self.__s_engine.restore()
        self.__s_engine.calc_pagerank(30)

    def initUI(self):
        """Build the top toolbar, homepage and (empty) results frame."""
        self.master.title("SPIDER")
        self.master.state('zoomed')
        self.grid()
        # Window icon.
        icon = tkinter.PhotoImage(file=os.path.join('resources', "logo_t.png"))
        self.master.tk.call('wm', 'iconphoto', self.master._w, icon)
        application.top_frame = tkinter.Frame(self.master,
                                              bg='black',
                                              bd=3,
                                              highlightcolor='blue')
        application.top_frame.grid(row=0)
        # Tool bar on top of master.
        self.tabs = tkinter.Label(application.top_frame,
                                  text="reserved for tabs")
        self.tabs.grid(row=0)
        self.options = tkinter.PhotoImage(
            file=os.path.join('resources', "options_3.png"))
        self.options = self.options.subsample(20, 20)
        self.options_button = tkinter.Button(application.top_frame,
                                             relief='flat',
                                             image=self.options,
                                             command=self.hide)
        self.options_button.grid(row=1, column=8)
        self.bookmark = tkinter.PhotoImage(
            file=os.path.join('resources', "bookmark_1.png"))
        self.bookmark = self.bookmark.subsample(40, 40)
        self.bookmark_button = tkinter.Button(application.top_frame,
                                              relief='flat',
                                              bg='white',
                                              image=self.bookmark,
                                              command=self.hide)
        self.bookmark_button.grid(row=1, column=7)
        self.search_box1 = tkinter.Entry(application.top_frame,
                                         textvariable=self.search_query,
                                         bg='white',
                                         fg='black',
                                         selectborderwidth=2,
                                         width=200)
        self.search_box1.grid(row=1, column=0)
        self.search_box1.bind('<Return>', self.__get)
        #self.search_box.bind('<Configure>',self._resize_search_box)
        # Homepage.
        self.homepage_frame = tkinter.Frame()
        self.homepage = self.create_homepage()
        self.homepage_frame.grid(row=2)
        # Results page (populated lazily on first query).
        self.results_frame = tkinter.Frame()

    def hide(self):
        # NOTE(review): `self.frame` is never assigned anywhere in this
        # class — this handler likely raises AttributeError; verify.
        self.frame.pack_forget()

    '''def _resize_search_box(self,event):
        new_width = event.width
        self.search_box.destroy()
        #if new_width > 70:
        self.search_box = tkinter.Entry(application.top_frame,bg = 'white',fg = 'black',selectborderwidth = 1,bd = 10,font = ("COPRGTL.TTF"),relief = 'flat',width = new_width)
        #else:
        #    self.search_box = tkinter.Entry(application.top_frame,bg = 'white',fg = 'black',selectborderwidth = 1,bd = 4,font = ("COPRGTL.TTF"),relief = 'flat',width = 77)'''

    def create_homepage(self):
        """Lay out the homepage: background, logo, search box, suggestions."""
        background_image = Image.open(self.bg_image)
        background_image = background_image.resize((1536, 841))
        self.background_photo = ImageTk.PhotoImage(background_image)
        self.background_label = tkinter.Label(self.homepage_frame,
                                              image=self.background_photo)
        self.background_label.grid(row=1)
        # Name/logo label.
        self.name = tkinter.Label(self.homepage_frame,
                                  text=" ADS ",
                                  bg='black',
                                  fg='white',
                                  font=("SHOWG.TTF", 60))
        self.name.grid()
        self.name.place(x=620, y=275)
        # Search box.
        self.search_box = tkinter.Entry(self.homepage_frame,
                                        textvariable=self.search_query,
                                        bg='white',
                                        fg='black',
                                        selectborderwidth=1,
                                        bd=6,
                                        font=("COPRGTL.TTF", 16),
                                        relief='flat',
                                        width=40)
        self.search_box.bind('<Return>', self.__get)
        self.search_box.grid()
        self.search_box.place(x=525, y=400)
        #centre 541,403.5
        # Search icon/button.
        self.search_image = tkinter.PhotoImage(file=os.path.join(
            'resources', "search_4.png"))
        #1-16 /4 - 8
        self.search_image = self.search_image.subsample(8, 8)
        self.search_button = tkinter.Button(self.homepage_frame,
                                            image=self.search_image,
                                            bg='white',
                                            relief='flat')
        self.search_button.grid(row=1)
        self.search_button.place(x=1020, y=400)
        # Suggestion listbox under the search box.
        self.listbox = tkinter.Listbox(self.homepage_frame,
                                       selectborderwidth=2,
                                       width=0,
                                       height=0)
        #self.listbox.bind('<Configure>',self.resize_listbox)
        self.listbox.grid(row=1)
        self.listbox.place(x=525, y=439)

    def __get(self, event):
        """Run a query (Return key handler) and switch to the results page."""
        input_str = self.search_box.get()
        # Bump the query's history weight by 0.1 per search.
        if input_str in self.history:
            self.history[input_str] += 0.1
        else:
            self.history[input_str] = 0.1
        results = self.__s_engine.query(input_str)
        self.homepage_frame.grid_forget()
        # NOTE(review): create_resultspage returns None, so self.resultspage
        # is always None — verify this attribute is actually needed.
        self.resultspage = self.create_resultspage(results)
        self.results_frame.grid()

    def update_listbox(self, *args):
        """Refresh search suggestions as the query text changes."""
        text = self.search_query.get()
        if text.isprintable():
            relevent_words = self.__from_history(
                text)  # search the term from history; returns a list
            self.listbox.delete(0, self.listbox.size())
            # NOTE(review): __from_history returns None on a miss, which
            # would make this loop raise TypeError — confirm.
            for each_item in relevent_words:
                self.listbox.insert(self.listbox.size(), each_item)

    def __from_history(self, query):
        if query in self.history:
            # NOTE(review): `key=self.history.values` passes a bound method
            # that is called once per word — looks suspicious; verify the
            # intended ranking.
            return sorted(self.history, key=self.history.values)
        return

    def create_resultspage(self, results):
        """Render the list of result URLs into the results frame."""
        if self.results_text:
            self.results_text.grid_forget()
        self.results_text = tkinter.Label(self.results_frame, text="results\n")
        self.results_text.grid()
        self.results = tkinter.Label(self.results_frame, font=('', 20))
        self.results.grid(row=1, column=0)
        s = ''
        for each_url in results:
            s += each_url + '\n'
            # arr = box(self.results_frame,each_url)
        self.results.config(text=s)

    def __restore_history(self):
        """Load history words/frequencies from the JSON files on disk."""
        # NOTE(review): `search_history`/`search_freq` are class attributes
        # but referenced here as bare names — this raises NameError unless
        # module-level variables of the same names exist; verify.
        words_file = open(search_history)
        freq_file = open(search_freq)
        words = words_file.read()
        freq = freq_file.read()
        words_list = []
        freq_list = []
        if words:
            words_list = json.loads(words)
        if freq:
            freq_list = json.loads(freq)
        words_file.close()
        freq_file.close()
        for loop_var in range(len(words_list)):
            self.history[words_list[loop_var]] = freq_list[loop_var]

    def store_history(self):
        """Persist history words/frequencies back to the JSON files."""
        # NOTE(review): same bare-name issue as __restore_history — verify.
        words_file = open(search_history, 'w')
        freq_file = open(search_freq, 'w')
        words_list = self.history.keys()
        freq_list = self.history.values()
        words_file.write(json.dumps(words_list))
        freq_file.write(json.dumps(freq_list))
        words_file.close()
        freq_file.close()
def reset(cls):
    """Wipe all repository state back to a pristine condition."""
    cls._shared_state.clear()
    cls._registries.clear()
    # Rebind rather than clear: gives the class a brand-new empty trie.
    cls._arg_trie = SortedStringTrie()
class FastTrieLookup(object):
    """Prefix-searchable index of people's name parts.

    Every name part (title, first/middle/last name, suffix) is stored
    lower-cased in a SortedStringTrie and maps to a dict of
    {person.id: person.full_name} entries sharing that name.
    """

    # Person attributes whose values are indexed, in a fixed order.
    _NAME_ATTRS = ('title', 'first_name', 'middle_name', 'last_name', 'suffix')

    def __init__(self):
        self.trie = SortedStringTrie()

    def _add_name(self, name, value_dict):
        """
        Adds a name and associated value dict to trie.
        value_dict should be a dict {person.id: person.full_name}
        """
        key = name.lower()
        try:
            # Name already present (another person shares it): merge entries.
            self.trie[key].update(value_dict)
        except KeyError:
            self.trie[key] = value_dict

    def _delete_value_for_name(self, name, value_dict_key):
        """
        Deletes a name's associated value_dict entry from the trie.
        If the name has no values left afterwards it is removed entirely.
        `value_dict_key` should be person.id. Raises KeyError when the
        name is not in the trie at all.
        """
        key = name.lower()
        if key not in self.trie:
            raise KeyError(key)
        values = self.trie[key]
        # Drop this person's entry, if present.
        values.pop(value_dict_key, None)
        if values:
            # Other people still share this name: store what remains.
            self.trie[key] = values
        else:
            del self.trie[key]

    def add_person(self, person):
        """
        Adds all of a person's non-empty name attributes to the trie,
        associating each with the person's value dict {id: full_name}.
        """
        value_dict = {person.id: person.full_name}
        for attr in self._NAME_ATTRS:
            part = getattr(person, attr)
            if part:
                self._add_name(part, value_dict)

    def remove_person(self, person):
        """
        Removes all of a person's non-empty name attributes (and their
        associated entries) from the trie.
        """
        for attr in self._NAME_ATTRS:
            part = getattr(person, attr)
            if part:
                self._delete_value_for_name(part, person.id)

    def get_persons_by_prefix(self, prefix):
        """
        Returns all value dicts associated with a name having the given
        (case-insensitive) prefix.
        """
        return self.trie.values(prefix.lower())
def __init__(self):
    """Set up empty lookup structures: exact-match and repaired-match
    tries plus the prefix -> original-sentence map."""
    self.full_sentences = {}
    self.full_match = Trie()
    self.repairs_match = Trie()
def create_training_instances(input_file, max_seq_length, tokenizer, rng,
                              alias2entities):
    """Create `TrainingInstance`s from raw text.

    Reads JSON lines from *input_file*, whitespace-tokenises each text,
    locates alias mentions via a trie of alias strings, maps alias spans
    from character offsets to WordPiece token offsets, and finally builds
    shuffled training instances document by document.
    """

    def is_whitespace(c):
        # Treat the narrow no-break space (U+202F) as whitespace too.
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    all_documents = []
    all_alias_token_spans = []
    from pytrie import SortedStringTrie as Trie
    trie = Trie()
    # Add all alias strings to the trie (values are unused placeholders).
    for alias, ents in alias2entities.items():
        trie.setdefault(alias, 0)
    with open(input_file, "r") as reader:
        for line in tqdm(reader, desc='converting tokens'):
            line = tokenization.convert_to_unicode(line.strip())
            line = json.loads(line)['text']
            tokens = []
            # NOTE(review): `do_lower_case` is read from an enclosing scope
            # (module global?) rather than a parameter — confirm.
            if do_lower_case:
                line = line.lower()
            # char_to_word_offset[i] = index of the coarse (whitespace)
            # token that character i belongs to.
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in line:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        tokens.append(c)
                    else:
                        tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(tokens) - 1)
            # Match aliases in the document (character spans).
            alias_spans = match_alias(line, trie, alias2entities)
            # These spans are in coarse-token units; span end is the index
            # of the alias's last coarse token.
            alias_token_spans = [(char_to_word_offset[span[0]],
                                  char_to_word_offset[span[1] - 1])
                                 for span in alias_spans]
            for span, token_span in zip(alias_spans, alias_token_spans):
                alias_tokens = ' '.join(tokens[token_span[0]:token_span[1] + 1])
                alias_texts = line[span[0]:span[1]]
                # NOTE(review): the assert "message" is a print() call, which
                # evaluates to None — it prints on failure but the
                # AssertionError itself carries no message; confirm intended.
                assert alias_tokens in alias2entities, print(
                    alias_tokens, token_span, alias_texts, span)
            # assert all(' '.join(tokens[span[0]: span[1] + 1]) in alias2entities for span in alias_token_spans), \
            #     print([' '.join(tokens[span[0]: span[1] + 1]) for span in alias_token_spans])
            tok_to_orig_index = []  # fine-grained (subword) -> coarse index
            orig_to_tok_index = []  # coarse -> first fine-grained index
            real_tokens = []
            for (i, token) in enumerate(tokens):
                orig_to_tok_index.append(len(real_tokens))
                sub_tokens = tokenizer.tokenize(token)
                for sub_token in sub_tokens:
                    tok_to_orig_index.append(i)
                    real_tokens.append(sub_token)
            # If the span's last coarse token is the final token, end at the
            # document's last subword position; otherwise end one subword
            # before the next coarse token's first subword.
            real_alias_token_spans = []
            for span in alias_token_spans:
                if span[1] == len(tokens) - 1:
                    # NOTE(review): orig_to_tok_index[-1] is the FIRST subword
                    # of the last coarse token — trailing subwords of a
                    # multi-piece last token are excluded; confirm intended.
                    real_end = orig_to_tok_index[-1]
                else:
                    real_end = orig_to_tok_index[span[1] + 1] - 1
                real_start = orig_to_tok_index[span[0]]
                real_alias_token_spans.append((real_start, real_end))
            # alias_token_spans = [(orig_to_tok_index[span[0]], orig_to_tok_index[span[1]])
            #                      for span in alias_token_spans]
            all_documents.append(real_tokens)
            all_alias_token_spans.append(real_alias_token_spans)
    vocab_words = list(tokenizer.vocab.keys())
    instances = []
    for document_index in tqdm(range(len(all_documents)),
                               total=len(all_documents),
                               desc='creating instances'):
        instances.extend(
            create_instances_from_document(all_documents, document_index,
                                           all_alias_token_spans,
                                           max_seq_length, vocab_words, rng))
    rng.shuffle(instances)
    return instances
def __init__(self):
    """Create the single backing structure: an empty SortedStringTrie
    used for all prefix lookups."""
    self.trie = SortedStringTrie()
class AutoComplete:
    """Prefix-based sentence auto-complete backed by two tries.

    ``full_match`` maps a cleaned prefix (cropped to MAXIMUM_NUM_CHARS) to a
    list of ``(filename, line_num)`` tuples; ``repairs_match`` maps a prefix to
    cached typo-corrected suggestions; ``full_sentences`` maps
    ``(filename, line_num)`` back to the original sentence text.
    """

    MAXIMUM_NUM_CHARS = 10
    NUM_OF_SUGGESTIONS = 5

    def __init__(self):
        self.full_match = Trie()
        self.repairs_match = Trie()
        self.full_sentences = {}

    def prepare_full_match(self, folder_path):
        """Index every ``*.txt`` file under ``folder_path`` (recursively)."""
        count = 0
        for file in glob.glob(folder_path + "/**/*.txt", recursive=True):
            count += 1
            print(count, file)
            with open(file, "r", encoding="utf8") as txt_file:
                for line_num, sen in enumerate(txt_file):
                    if len(sen) > 1:  # skip empty / newline-only lines
                        self.process_line(file, sen, line_num)
        print("count", count)

    def process_line(self, file, sen, line_num):
        """Index every word-aligned suffix of ``sen`` under its cropped prefix."""
        try:
            cleaned_line = AutoComplete.clean_line(sen)
            cleaned_line_in_words = cleaned_line.split()
            has_insert_flag = False
            for i in range(len(cleaned_line_in_words) - 1):
                tmp_line = " ".join(cleaned_line_in_words[i:])
                if self.sentence_exist_for_prefix(
                        sen, tmp_line[:AutoComplete.MAXIMUM_NUM_CHARS]):
                    continue
                has_insert_flag = self.insert_full_match_sentence(
                    file, tmp_line[:AutoComplete.MAXIMUM_NUM_CHARS],
                    line_num) or has_insert_flag
            if has_insert_flag:
                self.full_sentences[(file, line_num)] = sen
        except Exception:  # was a bare except: log context, then re-raise
            print("ERROR", sen)
            raise

    def sentence_exist_for_prefix(self, sen, prefix):
        """Return True if a sentence containing/contained-by ``sen`` is already indexed for ``prefix``."""
        try:
            for _tuple in self.full_match[prefix]:
                tmp_sen = self.full_sentences[_tuple]
                if sen.find(tmp_sen) > -1 or tmp_sen.find(sen) > -1:
                    return True
            return False
        except KeyError:  # prefix (or sentence key) not indexed yet
            return False

    def insert_full_match_sentence(self, filename, cropped, line_num):
        """Add ``(filename, line_num)`` under ``cropped`` unless the slot is full."""
        try:
            sug_len = len(self.full_match[cropped])
        except KeyError:  # first sentence for this prefix
            self.full_match[cropped] = []
            sug_len = 0
        if sug_len < AutoComplete.NUM_OF_SUGGESTIONS:
            self.full_match[cropped].append((filename, line_num))
            return True
        return False

    def __call__(self, prefix, mode="online"):
        """Return up to NUM_OF_SUGGESTIONS completions for ``prefix``.

        Looks in the exact-match trie first, then the cached-repairs trie; in
        ``mode="prepare"`` it additionally computes new typo repairs and caches
        them for later calls.
        """
        clean_prefix = AutoComplete.clean_line(prefix)
        sugs = []
        i = 0
        # 1) exact-prefix matches
        arr_of_tuples = self.full_match.items(
            prefix=clean_prefix)[:AutoComplete.NUM_OF_SUGGESTIONS]
        for arr in arr_of_tuples:
            for suggestion in arr[1]:
                sugs.append(
                    self.get_auto_complete_data(suggestion, clean_prefix))
                i += 1
                if i >= AutoComplete.NUM_OF_SUGGESTIONS:
                    return sugs
        print(len(sugs))
        # 2) cached repairs  # TODO - have to sort
        arr_of_tuples = self.repairs_match.items(
            prefix=clean_prefix)[:AutoComplete.NUM_OF_SUGGESTIONS]
        for arr in arr_of_tuples:
            for suggestion in arr[1]:
                # Bug fix: repairs entries are 4-tuples
                # (text, line_num, offset, score); the default
                # mode="full_match" unpacked them as 2-tuples and crashed.
                sugs.append(
                    self.get_auto_complete_data(suggestion, clean_prefix,
                                                mode="repairs"))
                i += 1
                if i >= AutoComplete.NUM_OF_SUGGESTIONS:
                    return sugs
        print(len(sugs))
        # 3) compute fresh repairs and cache them
        if i < AutoComplete.NUM_OF_SUGGESTIONS and mode == "prepare":
            repairs_sugs = self.predict_prefix_and_update_from_repairs_trie(
                clean_prefix, AutoComplete.NUM_OF_SUGGESTIONS - i)
            for repair in repairs_sugs:
                self.update_repair_sentence(clean_prefix, repair)
            sugs += repairs_sugs
        return sugs

    def update_repair_sentence(self, prefix, repair_data):
        """Cache a computed repair suggestion under ``prefix`` (bounded per prefix)."""
        try:
            sug_len = len(self.repairs_match[prefix])
        except KeyError:  # first repair cached for this prefix
            self.repairs_match[prefix] = []
            sug_len = 0
        if sug_len < AutoComplete.NUM_OF_SUGGESTIONS:
            self.repairs_match[prefix].append(
                (repair_data.source_text, repair_data.line_num,
                 repair_data.offset, repair_data.score))

    def predict_prefix_and_update_from_repairs_trie(self,
                                                    prefix,
                                                    amount=NUM_OF_SUGGESTIONS,
                                                    mode="offline"):
        """Generate up to ``amount`` suggestions from typo-corrected variants of ``prefix``.

        ``get_mistakes_by_penalty`` yields (variant, penalty) pairs; each
        variant is looked up in the exact-match trie.
        """
        options = get_mistakes_by_penalty(prefix)
        amount_found = 0
        suggestions = []
        for option, penalty in options:
            arr_of_tuples = self.full_match.items(
                prefix=option)[:AutoComplete.NUM_OF_SUGGESTIONS]
            for arr in arr_of_tuples:
                for suggestion in arr[1]:
                    suggestions.append(
                        self.get_auto_complete_data(suggestion, option,
                                                    penalty))
                    amount_found += 1
                    if amount_found >= amount:
                        return suggestions
        return suggestions

    @staticmethod
    def clean_line(sentence):
        """Lowercase, strip, and collapse all non-word runs to single spaces."""
        sentence2 = re.sub(r'\W+', ' ', sentence.lower().strip())
        sentence2 = re.sub(' +', ' ', sentence2)
        return sentence2

    def get_auto_complete_data(self,
                               file_line_tuple,
                               clean_prefix,
                               penalty=0,
                               mode="full_match"):
        """Build an AutoCompleteData record from a stored trie entry.

        ``mode="full_match"`` entries are (filename, line_num) pairs; anything
        else is a cached-repair 4-tuple (text, line_num, offset, score).
        """
        if mode == "full_match":
            filename, line_num = file_line_tuple
            full_sentence = self.full_sentences[file_line_tuple]
            score = 2 * len(clean_prefix) - penalty  # TODO: tune scoring
            offset = AutoComplete.clean_line(full_sentence).find(clean_prefix)
        else:
            filename, line_num, offset, score = file_line_tuple
            full_sentence = self.full_sentences[(filename, line_num)]
        return AutoCompleteData(full_sentence, filename, offset, score,
                                line_num)

    def observer_update_function(self):
        """Persist the index on a background thread."""
        thread = Thread(target=self.save_to_pkl, args=(self, ))
        thread.start()

    def save_to_pkl(self, *args):
        """Pickle the whole object to a timestamped file."""
        now = datetime.now()  # current date and time
        date_time = now.strftime("%d_%m_%Y__%H_%M_%S")
        with open(
                f"pkl_files_updated/auto_complete_with_repairs_{date_time}.pkl",
                "wb") as pkl_file:
            pickle.dump(self, pkl_file)
class TestTrie(unittest.TestCase):
    """Unit tests for SortedStringTrie prefix operations."""

    def setUp(self):
        """Build a small trie mapping each word to its index before every test."""
        self.words = 'an ant all allot alloy aloe are ate be'.split()
        self.trie = SortedStringTrie(zip(self.words, range(len(self.words))))

    def test_longest_prefix(self):
        """Longest stored key that is a prefix of the query."""
        self.assertEqual(self.trie.longest_prefix('antonym'), 'ant')
        self.assertEqual(self.trie.longest_prefix('are'), 'are')
        self.assertEqual(self.trie.longest_prefix('alla'), 'all')
        self.assertEqual(self.trie.longest_prefix('allo'), 'all')
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(self.trie.longest_prefix('alumni', default=None),
                         None)
        self.assertEqual(self.trie.longest_prefix('linux', default=-1), -1)

    def test_longest_prefix_value(self):
        self.assertEqual(self.trie.longest_prefix_value('antonym'), 1)
        self.assertEqual(self.trie.longest_prefix_value('are'), 6)
        self.assertEqual(self.trie.longest_prefix_value('alla'), 2)
        self.assertEqual(self.trie.longest_prefix_value('allo'), 2)
        self.assertRaises(KeyError, self.trie.longest_prefix_value, 'alumni')
        self.assertEqual(self.trie.longest_prefix_value('alumni',
                                                        default=None), None)
        self.assertEqual(self.trie.longest_prefix_value('linux', default=-1),
                         -1)

    def test_longest_prefix_item(self):
        self.assertEqual(self.trie.longest_prefix_item('antonym'), ('ant', 1))
        self.assertEqual(self.trie.longest_prefix_item('are'), ('are', 6))
        self.assertEqual(self.trie.longest_prefix_item('alla'), ('all', 2))
        self.assertEqual(self.trie.longest_prefix_item('allo'), ('all', 2))
        self.assertRaises(KeyError, self.trie.longest_prefix_item, 'alumni')
        self.assertEqual(self.trie.longest_prefix_item('alumni',
                                                       default=None), None)
        self.assertEqual(self.trie.longest_prefix_item('linux', default=-1),
                         -1)

    def test_iter_prefixes(self):
        self.assertEqual(list(self.trie.iter_prefixes('antonym')),
                         ['an', 'ant'])
        self.assertEqual(list(self.trie.iter_prefixes('are')), ['are'])
        self.assertEqual(list(self.trie.iter_prefixes('alumni')), [])

    def test_iter_prefix_values(self):
        self.assertEqual(list(self.trie.iter_prefix_values('antonym')), [0, 1])
        self.assertEqual(list(self.trie.iter_prefix_values('are')), [6])
        self.assertEqual(list(self.trie.iter_prefix_values('alumni')), [])

    def test_iter_prefix_items(self):
        self.assertEqual(list(self.trie.iter_prefix_items('antonym')),
                         [('an', 0), ('ant', 1)])
        self.assertEqual(list(self.trie.iter_prefix_items('are')),
                         [('are', 6)])
        self.assertEqual(list(self.trie.iter_prefix_items('alumni')), [])

    def test_keys_wprefix(self):
        self.assertEqual(self.trie.keys('al'),
                         ['all', 'allot', 'alloy', 'aloe'])
        self.assertEqual(self.trie.keys('are'), ['are'])
        self.assertEqual(self.trie.keys('ann'), [])

    def test_values_wprefix(self):
        self.assertEqual(self.trie.values('al'), [2, 3, 4, 5])
        self.assertEqual(self.trie.values('are'), [6])
        self.assertEqual(self.trie.values('ann'), [])

    def test_items_wprefix(self):
        self.assertEqual(self.trie.items('al'), [('all', 2), ('allot', 3),
                                                 ('alloy', 4), ('aloe', 5)])
        self.assertEqual(self.trie.items('are'), [('are', 6)])
        self.assertEqual(self.trie.items('ann'), [])

    def test_consistency_wprefix(self):
        """items(prefix) must agree with zip(keys(prefix), values(prefix))."""
        trie = self.trie
        for prefix in 'al', 'are', 'ann':
            self.assertEqual(
                trie.items(prefix),
                list(zip(trie.keys(prefix), trie.values(prefix)))
            )

    def test_empty_string(self):
        """The empty string is a valid key and a prefix of every query."""
        self.trie[''] = '!'
        self.assertEqual(self.trie.keys(''),
                         ['', 'all', 'allot', 'alloy', 'aloe', 'an', 'ant',
                          'are', 'ate', 'be'])
        self.assertEqual(self.trie.values(''),
                         ['!', 2, 3, 4, 5, 0, 1, 6, 7, 8])
        self.assertEqual(self.trie.items(''),
                         [('', '!'), ('all', 2), ('allot', 3), ('alloy', 4),
                          ('aloe', 5), ('an', 0), ('ant', 1), ('are', 6),
                          ('ate', 7), ('be', 8)])
        self.assertEqual(list(self.trie.iter_prefixes('foo')), [''])
        self.assertEqual(list(self.trie.iter_prefix_values('foo')), ['!'])
        self.assertEqual(list(self.trie.iter_prefix_items('foo')),
                         [('', '!')])
        self.assertEqual(self.trie.longest_prefix('foo'), '')
        self.assertEqual(self.trie.longest_prefix_value('foo'), '!')
        self.assertEqual(self.trie.longest_prefix_item('foo'), ('', '!'))

    def test_pickle(self):
        """Round-trip the trie through every pickle protocol."""
        from pickle import dumps, loads, HIGHEST_PROTOCOL
        # Bug fix: range(HIGHEST_PROTOCOL) skipped the highest protocol;
        # range() is half-open, so + 1 is needed to include it.
        for proto in range(HIGHEST_PROTOCOL + 1):
            unpickled = loads(dumps(self.trie, proto))
            self.assertEqual(self.trie, unpickled)
            self.assertTrue(type(self.trie) is type(unpickled))
            self.assertTrue(self.trie is not unpickled)

    def test_repr(self):
        """repr() must evaluate back to an equal trie of the same class."""
        evaled = eval(repr(self.trie))
        self.assertEqual(evaled, self.trie)
        self.assertEqual(evaled.__class__, self.trie.__class__)