def test_build_trie_mks_min(self):
    """
    Build a trie over the nested words 'a' .. 'abcd' and check that the
    summed gain (word length minus the first value returned by
    ``min_keystroke``) is zero, then log that gain for every insertion order.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    def total_gain(trie, pairs):
        # Sum over all words of len(word) - min_keystroke(word)[0].
        return sum(len(word) - trie.min_keystroke(word)[0] for _, word in pairs)

    queries = [(None, 'a'), (None, 'ab'), (None, 'abc'), (None, 'abcd')]
    self.assertEqual(total_gain(CompletionTrieNode.build(queries), queries), 0)

    # The permuted orders are only logged; nothing is asserted on them.
    for ordering in itertools.permutations(queries):
        permuted_trie = CompletionTrieNode.build(ordering)
        fLOG(total_gain(permuted_trie, ordering), ordering)
def test_build_trie_mks_min(self):
    """
    Check that the total gain is zero for the words 'a', 'ab', 'abc', 'abcd'
    inserted in that order, then log the total gain for each permutation.

    The gain of a word is its length minus the first element of the tuple
    returned by ``trie.min_keystroke``.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    def summed_gain(built, entries):
        # One pass over the (key, word) pairs; the key is not used.
        return sum(len(text) - built.min_keystroke(text)[0] for _, text in entries)

    queries = [(None, 'a'), (None, 'ab'), (None, 'abc'), (None, 'abcd')]
    reference = CompletionTrieNode.build(queries)
    self.assertEqual(summed_gain(reference, queries), 0)

    # No assertion here: the value is reported through fLOG only.
    for shuffled in itertools.permutations(queries):
        rebuilt = CompletionTrieNode.build(shuffled)
        fLOG(summed_gain(rebuilt, shuffled), shuffled)
def test_permutations(self):
    """
    For every insertion order of three queries, compare the statistics stored
    on each leaf with the values returned by the trie's lookup methods.

    Per leaf the test requires ``stat.mks1 <= stat.mks0``, that
    ``min_dynamic_keystroke(value)[0]`` equals ``stat.mks1`` and that
    ``min_keystroke(value)[0]`` equals ``stat.mks0``. A mismatch on the last
    check raises an ``Exception`` whose message dumps every leaf and node.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = ['actuellement', 'actualité', 'actu']
    weights = [1, 1, 0]
    for per in itertools.permutations(zip(queries, weights)):
        # The weights travel with the words but are not passed to the trie.
        trie = CompletionTrieNode.build([(None, w) for w, _ in per])
        trie.precompute_stat()
        trie.update_stat_dynamic()
        fLOG("----", per)
        for n in trie.leaves():
            fLOG(" ", n.value, n.stat.str_mks())
            # unittest assertion instead of a bare assert, which -O strips.
            self.assertLessEqual(n.stat.mks1, n.stat.mks0)
            self.assertEqual(trie.min_dynamic_keystroke(n.value)[0], n.stat.mks1)
            a = trie.min_keystroke(n.value)[0]
            if a != n.stat.mks0:
                # Collect a full dump of the trie to make the failure debuggable.
                mes = [str(per)]
                for n2 in trie.leaves():
                    mes.append("{0} - {1} || {2}".format(
                        n2.value, n2.stat.str_mks(), trie.min_keystroke(n2.value)))
                mes.append("---")
                for n2 in trie:
                    mes.append("{0} || {1}".format(n2.value, n2.stat.str_mks()))
                    for i, s in enumerate(n2.stat.completions):
                        mes.append(" {0} - {1}:{2}".format(i, s[0], s[1].value))
                raise Exception("difference\n{0}".format("\n".join(mes)))
def test_build_trie(self):
    """
    Build a trie from five weighted words and check its basic accessors:
    ``items`` yields two entries, ``iter_leaves`` returns the input pairs in
    order, ``leaves`` returns five nodes flagged as leaves, ``find('b')`` is a
    non-leaf node, ``find('ab')`` is the leaf for 'ab', and for every word the
    first two values returned by ``min_keystroke`` are equal.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')]
    trie = CompletionTrieNode.build(queries)

    res = list(trie.items())
    self.assertEqual(len(res), 2)
    res = list(trie.iter_leaves())
    self.assertEqual(
        res, [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')])

    lea = list(trie.leaves())
    self.assertEqual(len(lea), 5)
    # unittest assertions are used throughout: bare asserts vanish under -O.
    self.assertTrue(all(_.leave for _ in lea))

    node = trie.find('b')
    self.assertIsNotNone(node)
    self.assertFalse(node.leave)

    node = trie.find('ab')
    self.assertIsNotNone(node)
    self.assertTrue(node.leave)
    self.assertEqual(node.value, 'ab')

    for _, word in queries:
        ks = trie.min_keystroke(word)
        self.assertEqual(ks[0], ks[1])
def test_completions(self):
    """
    Build a trie from the lines of ``data/sample300.txt`` and check that every
    query can be found and appears at least once among the completions
    returned by ``all_mks_completions`` on its own node.

    Raises an ``Exception`` with the node's statistics and both completion
    dumps when a query is missing from its completions.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    this = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(this, "data", "sample300.txt")
    with open(data, "r", encoding="utf-8") as f:
        # Iterate the file directly; no need to materialise readlines() first.
        lines = [line.strip(" \n\r\t") for line in f]

    trie = CompletionTrieNode.build([(None, q) for q in lines])
    trie.precompute_stat()
    trie.update_stat_dynamic()

    for q in lines:
        find = trie.find(q)
        # unittest assertion instead of a bare assert, which -O strips.
        self.assertIsNotNone(find)
        sug = find.all_mks_completions()
        # Number of completion entries, over all prefixes, whose value is q.
        nb = sum(sum(1 for _, s in b if s.value == q) for _, b in sug)
        if nb == 0:
            info = "nb={0} q='{1}'".format(nb, q)
            st = find.stat.str_mks()
            text = find.str_all_completions()
            text2 = find.str_all_completions(use_precompute=False)
            raise Exception(
                "{4}\n---\nleave='{0}'\n{1}\n---\n{2}\n---\n{3}".format(
                    find.value, st, text, text2, info))
def test_permutations(self):
    """
    Rebuild the trie for each ordering of three queries and verify that the
    per-leaf statistics match what the trie's lookup methods return.

    For each leaf: ``stat.mks1 <= stat.mks0``,
    ``min_dynamic_keystroke(value)[0] == stat.mks1`` and
    ``min_keystroke(value)[0] == stat.mks0``. If the last equality fails, an
    ``Exception`` is raised with a textual dump of all leaves and nodes.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = ['actuellement', 'actualité', 'actu']
    weights = [1, 1, 0]
    for per in itertools.permutations(zip(queries, weights)):
        # Only the words are inserted; the paired weight is ignored.
        trie = CompletionTrieNode.build([(None, w) for w, _ in per])
        trie.precompute_stat()
        trie.update_stat_dynamic()
        fLOG("----", per)
        for n in trie.leaves():
            fLOG(" ", n.value, n.stat.str_mks())
            # Bare asserts are removed under -O; use the TestCase API instead.
            self.assertLessEqual(n.stat.mks1, n.stat.mks0)
            self.assertEqual(trie.min_dynamic_keystroke(n.value)[0], n.stat.mks1)
            a = trie.min_keystroke(n.value)[0]
            if a != n.stat.mks0:
                # Build the diagnostic: leaves first, then every node with
                # its stored completions.
                mes = [str(per)]
                for n2 in trie.leaves():
                    mes.append("{0} - {1} || {2}".format(
                        n2.value, n2.stat.str_mks(), trie.min_keystroke(n2.value)))
                mes.append("---")
                for n2 in trie:
                    mes.append("{0} || {1}".format(n2.value, n2.stat.str_mks()))
                    for i, s in enumerate(n2.stat.completions):
                        mes.append(" {0} - {1}:{2}".format(i, s[0], s[1].value))
                raise Exception("difference\n{0}".format("\n".join(mes)))
def test_build_trie(self):
    """
    Exercise the basic accessors of a trie built from five weighted words.

    Checks: ``items`` has two entries; ``iter_leaves`` gives back the input
    pairs in order; ``leaves`` gives five nodes whose ``leave`` flag is set;
    ``find('b')`` exists but is not a leaf; ``find('ab')`` is a leaf with
    value 'ab'; and ``min_keystroke(word)[0] == min_keystroke(word)[1]`` for
    every inserted word.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')]
    trie = CompletionTrieNode.build(queries)

    res = list(trie.items())
    self.assertEqual(len(res), 2)
    res = list(trie.iter_leaves())
    self.assertEqual(
        res, [(1, 'a'), (2, 'ab'), (3, 'abc'), (4, 'abcd'), (5, 'bc')])

    lea = list(trie.leaves())
    self.assertEqual(len(lea), 5)
    # TestCase assertions replace bare asserts, which are dropped under -O.
    self.assertTrue(all(_.leave for _ in lea))

    node = trie.find('b')
    self.assertIsNotNone(node)
    self.assertFalse(node.leave)

    node = trie.find('ab')
    self.assertIsNotNone(node)
    self.assertTrue(node.leave)
    self.assertEqual(node.value, 'ab')

    for _, word in queries:
        ks = trie.min_keystroke(word)
        self.assertEqual(ks[0], ks[1])
def test_mks_consistency(self):
    """
    Build a trie from the lines of ``data/sample_alpha_2.txt``, accumulate the
    per-leaf gains for ``mks0``/``mks1``/``mks2`` and the weighted average
    gains, log them, and assert that the trie has 494 leaves.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    def cmks(trie):
        # Returns (leaf count, gain mks0, gain mks1, gain mks2, total length).
        trie.precompute_stat()
        trie.update_stat_dynamic()
        gmks = gmksd = gmksd2 = 0.0
        nb = size = 0
        for leaf in trie.leaves():
            # Checked before this leaf is added: the running mks2 gain must
            # not exceed the running mks1 gain, and 'baaaab' must have mks1 == 4.
            if gmksd2 > gmksd or (leaf.value == "baaaab" and leaf.stat.mks1 != 4):
                info = leaf.str_all_completions()
                info2 = leaf.str_all_completions(use_precompute=True)
                raise Exception(
                    "issue with query '{0}'\n{1}\n##########\n{2}\n############\n{3}".format(
                        leaf.value, leaf.stat.str_mks(), info, info2))
            gmks += len(leaf.value) - leaf.stat.mks0
            gmksd += len(leaf.value) - leaf.stat.mks1
            gmksd2 += len(leaf.value) - leaf.stat.mks2
            size += len(leaf.value)
            nb += 1
        return nb, gmks, gmksd, gmksd2, size

    def gain_dynamique_moyen_par_mot(queries, weights):
        # Weighted gains, normalised by sum(weight * len(query)).
        per = list(zip(weights, queries))
        total = sum(w * len(q) for q, w in zip(queries, weights))
        trie = CompletionTrieNode.build([(None, q) for _, q in per])
        trie.precompute_stat()
        trie.update_stat_dynamic()

        def weighted_gain(keystrokes):
            saved = [(len(w) - keystrokes(w)[0], p) for p, w in per]
            return sum(g * p / total for g, p in saved)

        gain = weighted_gain(trie.min_keystroke0)
        gain_dyn = weighted_gain(trie.min_dynamic_keystroke)
        gain_dyn2 = weighted_gain(trie.min_dynamic_keystroke2)
        ave_length = sum(len(w) * p / total for p, w in per)
        return gain, gain_dyn, gain_dyn2, ave_length

    this = os.path.abspath(
        os.path.join(os.path.dirname(__file__), "data", "sample_alpha_2.txt"))
    with open(this, "r", encoding="utf-8") as f:
        titles = [line.strip(" \n\r\t") for line in f]
    fLOG(titles[:5])

    trie = CompletionTrieNode.build([(None, q) for q in titles])
    nb, gmks, gmksd, gmksd2, size = cmks(trie)
    uniform = [1.0] * len(titles)
    gain, gain_dyn, gain_dyn2, ave_length = gain_dynamique_moyen_par_mot(
        titles, uniform)
    fLOG("***", 1, nb, size, "*", gmks / size, gmksd / size, gmksd2 / size)
    fLOG("***", gain, gain_dyn, gain_dyn2, ave_length)
    self.assertEqual(nb, 494)
def test_build_dynamic_trie_mks_min(self):
    """
    On a four-word trie, check per leaf that the statistics exist, that
    ``find`` returns the very same node, that the dynamic keystroke count is
    not larger than the static one and not larger than the second dynamic
    variant, that ``min_dynamic_keystroke`` matches the stored statistics
    tuple, and that both completion dumps are non-empty.

    Raises ``Exception`` with identifying details when a lookup fails or an
    ordering between the keystroke counts is violated.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = [(None, 'a'), (None, 'ab'), (None, 'abc'), (None, 'abcd')]
    trie = CompletionTrieNode.build(queries)
    trie.precompute_stat()
    trie.update_stat_dynamic()

    for leave in trie.leaves():
        if leave.stat is None:
            raise Exception("None for {0}".format(leave))
        find = trie.find(leave.value)
        self.assertIs(find, leave)
        # unittest assertions replace bare asserts, which -O would strip.
        self.assertTrue(hasattr(leave, "stat"))
        self.assertTrue(hasattr(leave.stat, "mks0"))
        self.assertTrue(hasattr(leave.stat, "mks1"))

        mk1 = trie.min_keystroke(leave.value)
        try:
            mk = trie.min_dynamic_keystroke(leave.value)
            mk2 = trie.min_dynamic_keystroke2(leave.value)
        except Exception as e:
            # Re-raise with enough identity information to locate the node.
            raise Exception("{0}-{1}-{2}-{3}".format(
                id(trie), id(leave), str(leave), leave.leave)) from e

        if mk[0] > mk1[0]:
            raise Exception("weird {0} > {1}".format(mk, mk1))
        if mk2[0] < mk[0]:
            raise Exception("weird {0} > {1}".format(mk, mk2))

        fLOG(leave.value, mk, "-", leave.stat.str_mks())
        self.assertEqual(
            mk, (leave.stat.mks0, leave.stat.mks0_, leave.stat.mks1i_))

        text = leave.str_all_completions()
        self.assertTrue(text)
        text = leave.str_all_completions(use_precompute=False)
        self.assertTrue(text)
def test_completions(self):
    """
    Check, for each line of ``data/sample300.txt``, that the query is found
    in the trie and is listed at least once in the completions returned by
    ``all_mks_completions`` on its node.

    Raises an ``Exception`` carrying the node value, its statistics and both
    completion dumps when the query does not appear in any completion list.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    this = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(this, "data", "sample300.txt")
    with open(data, "r", encoding="utf-8") as f:
        # The file object is iterated directly rather than through readlines().
        lines = [line.strip(" \n\r\t") for line in f]

    trie = CompletionTrieNode.build([(None, q) for q in lines])
    trie.precompute_stat()
    trie.update_stat_dynamic()

    for q in lines:
        find = trie.find(q)
        # TestCase assertion: a bare assert is removed when running with -O.
        self.assertIsNotNone(find)
        sug = find.all_mks_completions()
        # Count the completion entries whose node value is exactly q.
        nb = sum(sum(1 for _, s in b if s.value == q) for _, b in sug)
        if nb == 0:
            info = "nb={0} q='{1}'".format(nb, q)
            st = find.stat.str_mks()
            text = find.str_all_completions()
            text2 = find.str_all_completions(use_precompute=False)
            raise Exception(
                "{4}\n---\nleave='{0}'\n{1}\n---\n{2}\n---\n{3}".format(
                    find.value, st, text, text2, info))
def test_mks_consistency(self):
    """
    Read ``data/sample_alpha_2.txt``, build a trie from its lines, log the
    aggregated and weighted gains, and assert the leaf count equals 494.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    def cmks(trie):
        # Sums len(value) - mksN over the leaves for N in 0, 1, 2, together
        # with the number of leaves and their cumulated length.
        trie.precompute_stat()
        trie.update_stat_dynamic()
        nb = size = 0
        gmks = gmksd = gmksd2 = 0.0
        for node in trie.leaves():
            # Evaluated on the totals accumulated so far, before adding node.
            if gmksd2 > gmksd or (node.value == "baaaab" and node.stat.mks1 != 4):
                info = node.str_all_completions()
                info2 = node.str_all_completions(use_precompute=True)
                raise Exception(
                    "issue with query '{0}'\n{1}\n##########\n{2}\n############\n{3}".format(
                        node.value, node.stat.str_mks(), info, info2))
            gmks += len(node.value) - node.stat.mks0
            gmksd += len(node.value) - node.stat.mks1
            gmksd2 += len(node.value) - node.stat.mks2
            size += len(node.value)
            nb += 1
        return nb, gmks, gmksd, gmksd2, size

    def gain_dynamique_moyen_par_mot(queries, weights):
        # Every average is divided by sum(weight * len(query)).
        per = list(zip(weights, queries))
        total = sum(w * len(q) for q, w in zip(queries, weights))
        trie = CompletionTrieNode.build([(None, q) for _, q in per])
        trie.precompute_stat()
        trie.update_stat_dynamic()

        def average(method):
            rows = [(len(w) - method(w)[0], p) for p, w in per]
            return sum(g * p / total for g, p in rows)

        gain = average(trie.min_keystroke0)
        gain_dyn = average(trie.min_dynamic_keystroke)
        gain_dyn2 = average(trie.min_dynamic_keystroke2)
        ave_length = sum(len(w) * p / total for p, w in per)
        return gain, gain_dyn, gain_dyn2, ave_length

    folder = os.path.dirname(__file__)
    this = os.path.abspath(os.path.join(folder, "data", "sample_alpha_2.txt"))
    with open(this, "r", encoding="utf-8") as f:
        titles = [row.strip(" \n\r\t") for row in f]
    fLOG(titles[:5])

    trie = CompletionTrieNode.build([(None, q) for q in titles])
    nb, gmks, gmksd, gmksd2, size = cmks(trie)
    gain, gain_dyn, gain_dyn2, ave_length = gain_dynamique_moyen_par_mot(
        titles, [1.0] * len(titles))
    fLOG("***", 1, nb, size, "*", gmks / size, gmksd / size, gmksd2 / size)
    fLOG("***", gain, gain_dyn, gain_dyn2, ave_length)
    self.assertEqual(nb, 494)
def test_mks_consistency(self):
    """
    Check the aggregated ``mks0`` gain on a trie holding one title (expected
    30) and then two titles (expected 43), logging the statistics each time.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    def cmks(trie):
        # Returns (leaf count, gain on mks0, gain on mks1, total length).
        trie.precompute_stat()
        trie.update_stat_dynamic()
        nb = size = 0
        gmks = gmksd = 0.0
        for leaf in trie.leaves():
            gmks += len(leaf.value) - leaf.stat.mks0
            gmksd += len(leaf.value) - leaf.stat.mks1
            size += len(leaf.value)
            nb += 1
        return nb, gmks, gmksd, size

    def measure(step):
        # Build from the current titles, log the statistics, hand back the gain.
        trie = CompletionTrieNode.build(titles)
        nb, gmks, gmksd, size = cmks(trie)
        fLOG("***", step, nb, size, gmks / nb, gmksd / nb,
             gmks / size, gmksd / size, gmks)
        return trie, gmks

    titles = [(None, '"contra el gang del chicharron"',
               '"Contra el gang del chicharron')]
    _, gmks = measure(1)
    self.assertEqual(gmks, 30)

    titles.append((None, '"la sequestree"', '"La séquestrée'))
    trie, gmks = measure(2)
    for leaf in trie.leaves():
        fLOG("***", leaf.value, leaf.stat.str_mks())
    self.assertEqual(gmks, 43)
def test_mks_consistency(self):
    """
    Verify the summed ``len(value) - mks0`` over the leaves: 30 with a single
    title in the trie, 43 once a second title has been added.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")

    def cmks(trie):
        # Accumulates, over the leaves, the gains on mks0 and mks1 plus the
        # leaf count and the cumulated value length.
        trie.precompute_stat()
        trie.update_stat_dynamic()
        gmks = gmksd = 0.0
        nb = size = 0
        for node in trie.leaves():
            gmks += len(node.value) - node.stat.mks0
            gmksd += len(node.value) - node.stat.mks1
            size += len(node.value)
            nb += 1
        return nb, gmks, gmksd, size

    def build_and_log(step):
        # Same logging line for both steps; only the step number changes.
        built = CompletionTrieNode.build(titles)
        nb, gmks, gmksd, size = cmks(built)
        fLOG("***", step, nb, size, gmks / nb, gmksd / nb,
             gmks / size, gmksd / size, gmks)
        return built, gmks

    titles = [(None, '"contra el gang del chicharron"',
               '"Contra el gang del chicharron')]
    _, gmks = build_and_log(1)
    self.assertEqual(gmks, 30)

    titles.append((None, '"la sequestree"', '"La séquestrée'))
    trie, gmks = build_and_log(2)
    for node in trie.leaves():
        fLOG("***", node.value, node.stat.str_mks())
    self.assertEqual(gmks, 43)
def gain_dynamique_moyen_par_mot(self, queries, weights):
    """
    Compute weighted average gains over *queries*, normalised by the sum of
    *weights*.

    A trie is built from the queries; for each of ``min_keystroke0``,
    ``min_dynamic_keystroke`` and ``min_dynamic_keystroke2`` the gain of a
    query is its length minus the first value the method returns.

    :param queries: words inserted in the trie
    :param weights: one weight per query, paired positionally
    :return: tuple ``(gain, gain_dyn, gain_dyn2, ave_length)`` where
        ``ave_length`` is the weighted average query length
    """
    per = list(zip(weights, queries))
    total = sum(weights) * 1.0
    trie = CompletionTrieNode.build([(None, q) for _, q in per])
    trie.precompute_stat()
    trie.update_stat_dynamic()

    def weighted_gain(keystrokes):
        # Gains are collected for every query first, then averaged.
        saved = [(len(w) - keystrokes(w)[0], p) for p, w in per]
        return sum(g * p / total for g, p in saved)

    gain = weighted_gain(trie.min_keystroke0)
    gain_dyn = weighted_gain(trie.min_dynamic_keystroke)
    gain_dyn2 = weighted_gain(trie.min_dynamic_keystroke2)
    ave_length = sum(len(w) * p / total for p, w in per)
    return gain, gain_dyn, gain_dyn2, ave_length
def test_check_bug_about_mergeing_completions(self):
    """
    Regression test on ``data/sample20000.txt``: after ``precompute_stat``,
    the node for 'Cann' must hold two completions and 'Cannes 2005' must be
    listed among its own completions under a prefix starting with 'Cannes',
    both with ``all_mks_completions`` and with ``all_completions``.

    Raises an ``Exception`` containing the completion dump when
    'Cannes 2005' is missing from either listing.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    data = os.path.join(os.path.abspath(
        os.path.dirname(__file__)), "data", "sample20000.txt")
    with open(data, "r", encoding="utf-8") as f:
        lines = [line.strip("\n\r\t ") for line in f]
    queries = [(None, q) for q in lines]

    fLOG("build trie")
    trie = CompletionTrieNode.build(queries)
    fLOG(len(queries), len(set(_[1] for _ in queries)),
         len(list(trie.leaves())), len(set(trie.leaves())))
    # unittest assertions replace bare asserts, which are stripped under -O.
    self.assertIn("Cannes 2005", set(_[1] for _ in queries))
    self.assertIn("Cannes 2005", set(_.value for _ in trie.leaves()))

    fLOG("bug precompute")
    trie.precompute_stat()
    fLOG("bug checking")
    find = trie.find('Cann')
    self.assertEqual(len(find.stat.completions), 2)

    leave = trie.find('Cannes 2005')

    # Precomputed completions: entries are indexable, the node is at index 1.
    sugg = leave.all_mks_completions()
    self.assertGreater(len(sugg), 0)
    verif = 0
    for p, completions in sugg:
        if p.value.startswith("Cannes"):
            for s in completions:
                if s[1].value == "Cannes 2005":
                    verif += 1
    if verif == 0:
        raise Exception(leave.str_all_completions(use_precompute=True))

    # Non-precomputed completions: entries are compared to the string itself.
    sugg = leave.all_completions()
    self.assertGreater(len(sugg), 0)
    verif = 0
    for p, completions in sugg:
        if p.value.startswith("Cannes"):
            for s in completions:
                if s == "Cannes 2005":
                    verif += 1
    if verif == 0:
        raise Exception(leave.str_all_completions(use_precompute=False))
def test_duplicates(self):
    """
    Build a trie from the same title inserted twice and check that iterating
    it yields six nodes. A ``ValueError`` raised while building or checking
    is tolerated and only logged.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    titles = ["abdcf", "abdcf"]
    try:
        fLOG(titles)
        trie = CompletionTrieNode.build(
            [(None, remove_diacritics(w).lower(), w) for w in titles])
        # Checked before the trie is iterated so a None result fails clearly;
        # unittest assertions are used because -O strips bare asserts.
        self.assertIsNotNone(trie)
        fLOG(trie)
        self.assertEqual(len(list(trie)), 6)
    except ValueError as e:
        # Deliberate best effort kept from the original: log and continue.
        fLOG(e)
def test_duplicates(self):
    """
    Insert the title 'abdcf' twice and verify the resulting trie iterates
    over six nodes. If a ``ValueError`` is raised along the way it is logged
    and the test ends without failing.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    titles = ["abdcf", "abdcf"]
    try:
        fLOG(titles)
        entries = [(None, remove_diacritics(w).lower(), w) for w in titles]
        trie = CompletionTrieNode.build(entries)
        # TestCase assertions instead of bare asserts (removed under -O); the
        # None check comes first so iteration never hides it.
        self.assertIsNotNone(trie)
        fLOG(trie)
        self.assertEqual(len(list(trie)), 6)
    except ValueError as e:
        # Kept as in the original: a ValueError is reported, not propagated.
        fLOG(e)
def test_check_bug_about_mergeing_completions(self):
    """
    Using ``data/sample20000.txt``, check that 'Cann' keeps two completions
    after ``precompute_stat`` and that 'Cannes 2005' shows up in its own
    completion lists under a prefix beginning with 'Cannes', for both
    ``all_mks_completions`` and ``all_completions``.

    Raises an ``Exception`` with the completion dump if 'Cannes 2005' is not
    found in one of the two listings.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    folder = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(folder, "data", "sample20000.txt")
    with open(data, "r", encoding="utf-8") as f:
        lines = [line.strip("\n\r\t ") for line in f]
    queries = [(None, q) for q in lines]

    fLOG("build trie")
    trie = CompletionTrieNode.build(queries)
    fLOG(len(queries), len(set(_[1] for _ in queries)),
         len(list(trie.leaves())), len(set(trie.leaves())))
    # Bare asserts disappear under -O, hence the TestCase assertions.
    self.assertIn("Cannes 2005", set(_[1] for _ in queries))
    self.assertIn("Cannes 2005", set(_.value for _ in trie.leaves()))

    fLOG("bug precompute")
    trie.precompute_stat()
    fLOG("bug checking")
    find = trie.find('Cann')
    self.assertEqual(len(find.stat.completions), 2)

    leave = trie.find('Cannes 2005')

    # First listing: each entry is indexable and holds a node at position 1.
    sugg = leave.all_mks_completions()
    self.assertGreater(len(sugg), 0)
    verif = 0
    for p, entries in sugg:
        if p.value.startswith("Cannes"):
            for s in entries:
                if s[1].value == "Cannes 2005":
                    verif += 1
    if verif == 0:
        raise Exception(leave.str_all_completions(use_precompute=True))

    # Second listing: entries are compared directly with the query string.
    sugg = leave.all_completions()
    self.assertGreater(len(sugg), 0)
    verif = 0
    for p, entries in sugg:
        if p.value.startswith("Cannes"):
            for s in entries:
                if s == "Cannes 2005":
                    verif += 1
    if verif == 0:
        raise Exception(leave.str_all_completions(use_precompute=False))
def test_build_trie_mks(self):
    """
    Build a weighted trie over 'a', 'ab', 'abc', 'abcd' and check the string
    form of the nodes returned by ``items_list``, that ``find('a')`` is
    truthy, and the exact tuples returned by ``min_keystroke`` for each word.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = [(4, 'a'), (2, 'ab'), (3, 'abc'), (1, 'abcd')]
    trie = CompletionTrieNode.build(queries)

    st = [str(node) for node in trie.items_list()]
    fLOG(st)
    self.assertEqual(
        st, ['[-::w=1]', '[#:a:w=4]', '[#:ab:w=2]', '[#:abc:w=3]', '[#:abcd:w=1]'])

    # unittest assertion instead of a bare assert, which -O strips.
    self.assertTrue(trie.find('a'))

    ms = [(word, trie.min_keystroke(word)) for _, word in queries]
    self.assertEqual(ms, [('a', (1, 1)), ('ab', (2, 2)),
                          ('abc', (3, 3)), ('abcd', (1, 0))])
def test_build_trie_mks(self):
    """
    Check a weighted trie built from four nested words: the string form of
    every node in ``items_list``, the truthiness of ``find('a')``, and the
    tuple returned by ``min_keystroke`` for each inserted word.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = [(4, 'a'), (2, 'ab'), (3, 'abc'), (1, 'abcd')]
    trie = CompletionTrieNode.build(queries)

    nodes = trie.items_list()
    st = [str(_) for _ in nodes]
    fLOG(st)
    self.assertEqual(st, [
        '[-::w=1]', '[#:a:w=4]', '[#:ab:w=2]', '[#:abc:w=3]', '[#:abcd:w=1]'
    ])

    find = trie.find('a')
    # A bare assert would be skipped under -O; use the TestCase API.
    self.assertTrue(find)

    ms = [(word, trie.min_keystroke(word)) for _, word in queries]
    self.assertEqual(ms, [('a', (1, 1)), ('ab', (2, 2)),
                          ('abc', (3, 3)), ('abcd', (1, 0))])
def test_build_dynamic_trie_mks_min(self):
    """
    For each leaf of a four-word trie, check that statistics are present,
    that ``find`` returns the same node object, that
    ``min_dynamic_keystroke`` is not above ``min_keystroke`` and not above
    ``min_dynamic_keystroke2``, that it equals the stored tuple
    ``(mks0, mks0_, mks1i_)``, and that both completion dumps are non-empty.

    Raises ``Exception`` when statistics are missing, when a dynamic lookup
    fails, or when the ordering between keystroke counts is violated.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    queries = [(None, 'a'), (None, 'ab'), (None, 'abc'), (None, 'abcd')]
    trie = CompletionTrieNode.build(queries)
    trie.precompute_stat()
    trie.update_stat_dynamic()

    for leave in trie.leaves():
        if leave.stat is None:
            raise Exception("None for {0}".format(leave))
        find = trie.find(leave.value)
        self.assertIs(find, leave)
        # TestCase assertions are used since bare asserts vanish under -O.
        self.assertTrue(hasattr(leave, "stat"))
        self.assertTrue(hasattr(leave.stat, "mks0"))
        self.assertTrue(hasattr(leave.stat, "mks1"))

        mk1 = trie.min_keystroke(leave.value)
        try:
            mk = trie.min_dynamic_keystroke(leave.value)
            mk2 = trie.min_dynamic_keystroke2(leave.value)
        except Exception as e:
            # Wrap with object identities so the failing node can be traced.
            raise Exception(
                "{0}-{1}-{2}-{3}".format(id(trie), id(leave), str(leave),
                                         leave.leave)) from e

        if mk[0] > mk1[0]:
            raise Exception("weird {0} > {1}".format(mk, mk1))
        if mk2[0] < mk[0]:
            raise Exception("weird {0} > {1}".format(mk, mk2))

        fLOG(leave.value, mk, "-", leave.stat.str_mks())
        self.assertEqual(
            mk, (leave.stat.mks0, leave.stat.mks0_, leave.stat.mks1i_))

        text = leave.str_all_completions()
        self.assertTrue(text)
        text = leave.str_all_completions(use_precompute=False)
        self.assertTrue(text)
def test_load_titles(self):
    """
    Load titles from ``data/wikititles.txt``, deduplicate them on their
    lower-cased, diacritic-free key, build a trie and check it.

    Checks performed: the last two sorted titles, the string form of the
    second and last nodes, the node count (3753), that the aggregated
    ``mks0`` gain does not exceed the ``mks1`` gain, that rebuilding the trie
    gives identical aggregates, that the ``mks1`` gain exceeds 0.62, and that
    the trie built from the first title alone has an ``mks0`` gain of 30.

    Raises ``Exception`` with diagnostic dumps when a node string or the
    ordering between the two gains is not the expected one.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    this = os.path.abspath(os.path.dirname(__file__))
    this = os.path.join(this, "data", "wikititles.txt")
    titles = sorted(enumerate_titles(this))

    # Keep the first title seen for each normalised key; log the others.
    res = {}
    dups = 0
    for w in titles:
        wc = remove_diacritics(w).lower()
        if wc not in res:
            res[wc] = w
        else:
            fLOG("duplicated key: '{0}', '{1}', key: '{2}'".format(
                w, res[wc], wc))
            dups += 1
    fLOG("len(titles)=", len(res), "duplicated", dups)

    titles = sorted((None, k, v) for k, v in res.items())
    self.assertEqual(titles[-1], (None, 'grand russe', 'Grand Russe'))
    self.assertEqual(titles[-2],
                     (None, 'grand rue de pera', 'Grand Rue de Pera'))

    trie = CompletionTrieNode.build(titles)
    nodes = list(trie)
    exp_value = '[-:":w=0]'
    if str(nodes[1]) != exp_value:
        lines = "\n".join(str(_) for _ in nodes[:5])
        lines2 = "\n".join(str(_) for _ in titles[:5])
        info = ";".join(k for k, _ in sorted(trie.children.items()))
        raise Exception("{0} != {1}\n{2}\nTITLES\n{3}\nINFO\n{4}".format(
            str(nodes[1]), exp_value, lines, lines2, info))
    if str(nodes[-1]) != "[#:grand russe:w=354]":
        lines = "\n".join(str(_) for _ in nodes[-5:])
        lines2 = "\n".join(str(_) for _ in titles[-5:])
        raise Exception("{0} != {1}\n{2}\nTITLES\n{3}".format(
            str(nodes[-1]), "[#:grand russe:w=354]", lines, lines2))
    self.assertEqual(len(nodes), 3753)

    def cmks(trie):
        # Returns (leaf count, gain on mks0, gain on mks1, total length).
        trie.precompute_stat()
        trie.update_stat_dynamic()
        gmks = 0.0
        gmksd = 0.0
        nb = 0
        size = 0
        for n in trie.leaves():
            gmks += len(n.value) - n.stat.mks0
            gmksd += len(n.value) - n.stat.mks1
            size += len(n.value)
            nb += 1
        return nb, gmks, gmksd, size

    nb, gmks, gmksd, size = cmks(trie)
    fLOG(nb, size, gmks / nb, gmksd / nb, gmks / size, gmksd / size)
    if gmks > gmksd:
        raise Exception("gmks={0} gmksd={1}".format(gmks, gmksd))
    if gmksd == 0:
        # Log the first 22 nodes to help diagnose, then fail explicitly
        # (self.fail is not stripped under -O, unlike `assert False`).
        for i, node in enumerate(trie):
            fLOG(node.value, "--", node.stat.str_mks())
            if i > 20:
                break
        self.fail("gmksd == 0")

    # Rebuilding from the same titles must give identical aggregates.
    trie = CompletionTrieNode.build(titles)
    nb2, gmks2, gmksd2, size = cmks(trie)
    self.assertEqual(nb, nb2)
    self.assertEqual(gmks, gmks2)
    self.assertEqual(gmksd, gmksd2)
    self.assertGreater(gmksd, 0.62)
    fLOG(nb2, gmks2 / nb2, gmksd2 / nb2)
    fLOG("-----")

    for i in range(1, 20):
        trie = CompletionTrieNode.build(titles[:i])
        nb, gmks, gmksd, size = cmks(trie)
        if i == 1:
            self.assertEqual(gmks, 30)
        fLOG(i, nb, size, gmks / nb, gmksd / nb,
             gmks / size, gmksd / size, gmks)
def test_load_titles(self):
    """
    Read ``data/wikititles.txt`` through ``enumerate_titles``, keep one title
    per lower-cased, diacritic-free key, build a trie and validate it.

    The test checks the last two sorted titles, the string form of the
    second and last nodes, a node count of 3753, that the ``mks0`` gain is
    not above the ``mks1`` gain, that a rebuilt trie yields the same
    aggregates, that the ``mks1`` gain is above 0.62, and that the first
    title alone gives an ``mks0`` gain of 30.

    Raises ``Exception`` with a diagnostic dump when a node string differs
    from the expected one or when the ``mks0`` gain exceeds the ``mks1`` gain.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    this = os.path.abspath(os.path.dirname(__file__))
    this = os.path.join(this, "data", "wikititles.txt")
    titles = sorted(enumerate_titles(this))

    # First title wins for a given normalised key; later ones are logged.
    res = {}
    dups = 0
    for w in titles:
        wc = remove_diacritics(w).lower()
        if wc not in res:
            res[wc] = w
        else:
            fLOG("duplicated key: '{0}', '{1}', key: '{2}'".format(
                w, res[wc], wc))
            dups += 1
    fLOG("len(titles)=", len(res), "duplicated", dups)

    titles = sorted((None, k, v) for k, v in res.items())
    self.assertEqual(titles[-1], (None, 'grand russe', 'Grand Russe'))
    self.assertEqual(
        titles[-2], (None, 'grand rue de pera', 'Grand Rue de Pera'))

    trie = CompletionTrieNode.build(titles)
    nodes = list(trie)
    exp_value = '[-:":w=0]'
    if str(nodes[1]) != exp_value:
        lines = "\n".join(str(_) for _ in nodes[:5])
        lines2 = "\n".join(str(_) for _ in titles[:5])
        info = ";".join(k for k, _ in sorted(trie.children.items()))
        raise Exception("{0} != {1}\n{2}\nTITLES\n{3}\nINFO\n{4}".format(
            str(nodes[1]), exp_value, lines, lines2, info))
    if str(nodes[-1]) != "[#:grand russe:w=354]":
        lines = "\n".join(str(_) for _ in nodes[-5:])
        lines2 = "\n".join(str(_) for _ in titles[-5:])
        raise Exception("{0} != {1}\n{2}\nTITLES\n{3}".format(
            str(nodes[-1]), "[#:grand russe:w=354]", lines, lines2))
    self.assertEqual(len(nodes), 3753)

    def cmks(trie):
        # Aggregates len(value) - mks0 and len(value) - mks1 over the leaves,
        # along with the leaf count and cumulated value length.
        trie.precompute_stat()
        trie.update_stat_dynamic()
        gmks = 0.0
        gmksd = 0.0
        nb = 0
        size = 0
        for n in trie.leaves():
            gmks += len(n.value) - n.stat.mks0
            gmksd += len(n.value) - n.stat.mks1
            size += len(n.value)
            nb += 1
        return nb, gmks, gmksd, size

    nb, gmks, gmksd, size = cmks(trie)
    fLOG(nb, size, gmks / nb, gmksd / nb, gmks / size, gmksd / size)
    if gmks > gmksd:
        raise Exception("gmks={0} gmksd={1}".format(gmks, gmksd))
    if gmksd == 0:
        # Dump the first 22 nodes, then fail; `assert False` would be
        # silently removed under -O, self.fail is not.
        for i, node in enumerate(trie):
            fLOG(node.value, "--", node.stat.str_mks())
            if i > 20:
                break
        self.fail("gmksd == 0")

    # A second build from the same input must reproduce the aggregates.
    trie = CompletionTrieNode.build(titles)
    nb2, gmks2, gmksd2, size = cmks(trie)
    self.assertEqual(nb, nb2)
    self.assertEqual(gmks, gmks2)
    self.assertEqual(gmksd, gmksd2)
    self.assertGreater(gmksd, 0.62)
    fLOG(nb2, gmks2 / nb2, gmksd2 / nb2)
    fLOG("-----")

    for i in range(1, 20):
        trie = CompletionTrieNode.build(titles[:i])
        nb, gmks, gmksd, size = cmks(trie)
        if i == 1:
            self.assertEqual(gmks, 30)
        fLOG(i, nb, size, gmks / nb, gmksd / nb,
             gmks / size, gmksd / size, gmks)
def test_build_dynamic_trie_mks_min(self):
    """
    On the queries of ``data/sample20000.txt``, check for every query that
    its node carries statistics, that queries longer than ten characters
    appear in their own completions, and that the dynamic keystroke count is
    neither above ``min_keystroke0`` nor above ``min_dynamic_keystroke2``.

    Progress is reported through ``fLOG`` and a ``CustomLog`` created in a
    temporary folder. Violations raise ``Exception`` with a diagnostic dump.
    """
    fLOG(__file__, self._testMethodName, OutputPrint=__name__ == "__main__")
    folder = os.path.abspath(os.path.dirname(__file__))
    data = os.path.join(folder, "data", "sample20000.txt")
    with open(data, "r", encoding="utf-8") as f:
        lines = [line.strip("\n\r\t ") for line in f]
    queries = [(None, line) for line in lines]

    temp = get_temp_folder(__file__, "temp_build_dynamic_trie_mks_min")
    clog = CustomLog(temp)

    clog("build trie")
    trie = CompletionTrieNode.build(queries)
    fLOG(len(queries), len(set(_[1] for _ in queries)),
         len(list(trie.leaves())), len(set(trie.leaves())))
    self.assertTrue("Cannes 2005" in set(_[1] for _ in queries))
    self.assertTrue("Cannes 2005" in set(_.value for _ in trie.leaves()))

    clog("precompute")
    trie.precompute_stat()
    clog("update")
    trie.update_stat_dynamic()
    clog("loop")
    fLOG("loop")

    def dump(node):
        # Statistics and both completion listings, used in error messages.
        return (node.stat.str_mks(),
                node.str_all_completions(),
                node.str_all_completions(use_precompute=False))

    for i, q in enumerate(queries):
        if i % 1000 == 0:
            clog(i)
            fLOG(i)
        word = q[1]
        leave = trie.find(word)
        if leave.stat is None:
            raise Exception("None for {0}".format(leave))
        self.assertTrue(hasattr(leave, "stat"))
        self.assertTrue(hasattr(leave.stat, "mks0"))
        self.assertTrue(hasattr(leave.stat, "mks1"))

        sug = leave.all_mks_completions()
        # Per prefix: occurrences of the query, and the list size.
        nb_ = [(a.value, len([s.value for _, s in b if s.value == word]))
               for a, b in sug]
        nbf_ = [(a.value, len(b)) for a, b in sug]
        nb = sum(count for _, count in nb_)
        mnb = max(count for _, count in nbf_)
        if nb == 0 and len(word) > 10:
            info = "nb={0} mnb={2} q='{1}'".format(nb, word, mnb)
            st, text, text2 = dump(leave)
            raise Exception(
                "{4}\n---\nleave='{0}'\n{1}\n---\n{2}\n---\n{3}".format(
                    leave.value, st, text, text2, info))

        mk1 = trie.min_keystroke0(leave.value)
        try:
            mk = trie.min_dynamic_keystroke(leave.value)
            mk2 = trie.min_dynamic_keystroke2(leave.value)
        except Exception as e:
            raise Exception("{0}-{1}-{2}-{3}".format(
                id(trie), id(leave), str(leave), leave.leave)) from e

        if mk[0] > mk1[0]:
            st, text, text2 = dump(leave)
            raise Exception(
                "weird {0} > {1} -- leave='{2}'\n{3}\n---\n{4}\n---\n{5}".format(
                    mk, mk1, leave.value, st, text, text2))
        if mk2[0] < mk[0]:
            st, text, text2 = dump(leave)
            raise Exception(
                "weird {0} > {1} -- leave='{2}'\n{3}\n---\n{4}\n---\n{5}".format(
                    mk, mk2, leave.value, st, text, text2))

    clog("end")
    fLOG("end")