def test_hunspell_compatibility(self):
    """
    Compare the words rejected by the speller with a recorded reject list.

    Every stripped line of ``hunspell_results/wiki-words.txt`` is passed to
    ``speller.check``; the words for which it returns ``None`` must be
    exactly the stripped lines of
    ``hunspell_results/wiki-words.txt.bad.results`` (presumably produced by
    hunspell itself -- confirm against the fixture's origin).
    """
    from pyspell import speller

    s = speller(
        self.aff_file(),
        self.dic_file(),
    )
    s.init()

    # The fixtures are only read, so open them read-only: "r+" additionally
    # requires write permission and fails on a read-only checkout.
    bad = set()
    with codecs.open(
        self.file("hunspell_results/wiki-words.txt"),
        mode="r", encoding="utf-8"
    ) as fin_words:
        for word in fin_words:
            word = word.strip()
            if s.check(word) is None:
                bad.add(word)

    bad_expected = set()
    with codecs.open(
        self.file("hunspell_results/wiki-words.txt.bad.results"),
        mode="r", encoding="utf-8"
    ) as fin_results:
        for word in fin_results:
            bad_expected.add(word.strip())

    # Comparing against an empty set makes a failure list the offending
    # words instead of reporting only "False is not true".
    self.assertEqual(
        set(), bad_expected - bad,
        "words expected to be rejected but accepted"
    )
    self.assertEqual(
        set(), bad - bad_expected,
        "words rejected but not expected to be"
    )
def test_hunspell_compatibility(self):
    """
    Compare the words rejected by the speller with a recorded reject list.

    Every stripped line of ``hunspell_results/wiki-words.txt`` is passed to
    ``speller.check``; the words for which it returns ``None`` must be
    exactly the stripped lines of
    ``hunspell_results/wiki-words.txt.bad.results`` (presumably produced by
    hunspell itself -- confirm against the fixture's origin).
    """
    from pyspell import speller

    s = speller(
        self.aff_file(),
        self.dic_file(),
    )
    s.init()

    # Read-only access is enough for fixtures; "r+" would also demand
    # write permission on the files.
    bad = set()
    with codecs.open(self.file("hunspell_results/wiki-words.txt"),
                     mode="r", encoding="utf-8") as fin_words:
        for word in fin_words:
            word = word.strip()
            if s.check(word) is None:
                bad.add(word)

    bad_expected = set()
    with codecs.open(
            self.file("hunspell_results/wiki-words.txt.bad.results"),
            mode="r", encoding="utf-8") as fin_results:
        for word in fin_results:
            bad_expected.add(word.strip())

    # An empty-set comparison shows the differing words in the failure
    # message, which a bare length check does not.
    self.assertEqual(
        set(), bad_expected - bad,
        "words expected to be rejected but accepted"
    )
    self.assertEqual(
        set(), bad - bad_expected,
        "words rejected but not expected to be"
    )
def morpho_parse(env):
    """
    Gather most used words according to a specific definition

    Note: not tested with larger wikis!

    For every file matching ``env["input"]["morpho_glob"]`` inside the input
    directory the file is parsed with ``simplemorpho.morpho``, the dictionary
    keys of the configured speller are looked up (lower-cased) in the parsed
    forms, and the unique rule strings of all parsed forms are printed.

    :param env: nested configuration mapping; reads ``start_dir``,
        ``src_dir`` and ``input`` (``dir``, ``morpho_glob``,
        ``dictionaries``).
    """
    import glob
    from simplemorpho import morpho, word_forms

    input_glob = os.path.join(
        env["start_dir"], env["input"]["dir"], env["input"]["morpho_glob"]
    )
    for f in glob.glob(input_glob):
        _logger.info(u"Working on [%s]", f)
        m = morpho(f)
        # 0 disables the limit checked in the forms loop below
        max_show = 0
        m.parse(all_forms=True, max_process=max_show)

        # Make the project sources importable; insert the path only once so
        # sys.path does not grow with every processed file.
        src_path = os.path.join(env["start_dir"], env["src_dir"])
        if src_path not in sys.path:
            sys.path.insert(0, src_path)
        dictionaries = env["input"]["dictionaries"]
        aff_file = os.path.join(
            env["start_dir"], env["input"]["dir"], dictionaries + ".aff")
        dic_file = os.path.join(
            env["start_dir"], env["input"]["dir"], dictionaries + ".dic")
        from pyspell import speller
        s = speller(aff_file, dic_file)
        s.init()

        # Fetch the parsed forms once instead of once per dictionary key.
        known_forms = m.all_forms()
        not_found = 0
        for k in s._dic._d.keys():
            if k.lower() not in known_forms:
                _logger.info(u"Word from .dic not found in ma [%s]", k)
                not_found += 1
        print("Not found words [%d out of %d]" % (not_found, len(s._dic._d)))

        uniq_rules = set()
        uniq_rules_right = set()
        # Own counter with an explicit start value: with no parsed forms the
        # summary reports 0 rather than the not-found count from above.
        last_pos = 0
        for last_pos, (k, v) in enumerate(m.forms().iteritems()):
            if 0 < max_show < last_pos:
                break
            r_strs = word_forms.rule_strs(v.rules())
            uniq_rules |= set(r_strs)
            # the text before "->" of every rule string
            uniq_rules_right |= set([x.split("->")[0] for x in r_strs])

        msg = "All rules [%d], unique rules [%d], unique rules right [%d]" % (
            last_pos, len(uniq_rules), len(uniq_rules_right)
        )
        print(msg)
        for pos, r in enumerate(sorted(uniq_rules)):
            # cap the listing
            if 1000 < pos:
                break
            print(r)
        # repeat the summary so it is visible below the long listing
        print(msg)
def test_arbitrary_affix(self):
    """ Check that `ammm` and `am` are not accepted by the `small` dictionary. """
    from pyspell import speller

    checker = speller(self.aff_file("small"), self.dic_file("small"))
    checker.init()

    # (word, second argument of check, whether a non-None result is expected)
    cases = (
        ("ammm", False, False),
        ("am", False, False),
    )
    for word, flag, should_accept in cases:
        result = checker.check(word, flag)
        self.assertEqual(should_accept, result is not None)
def test_arbitrary_affix(self):
    """ Check that `ammm` and `am` are not accepted by the `small` dictionary. """
    from pyspell import speller

    checker = speller(self.aff_file("small"), self.dic_file("small"))
    checker.init()

    # (word, second argument of check, whether a non-None result is expected)
    cases = (
        ("ammm", False, False),
        ("am", False, False),
    )
    for word, flag, should_accept in cases:
        was_found = checker.check(word, flag) is not None
        self.assertEqual(should_accept, was_found)
def test_text(self):
    """
    Check every whitespace-separated token of the `mini` text file.

    Each token must yield a non-``None`` result from ``speller.check`` and
    that result must equal the result obtained for the first token.
    """
    from pyspell import speller

    s = speller(
        self.aff_file("mini"),
        self.dic_file("mini"),
    )
    s.init()

    # The text is only read; "r+" would needlessly require write access.
    with codecs.open(self.text_file("mini"), mode="r", encoding="utf-8") as fin:
        # result for the first token, used as the reference for all others
        guru_accepted = None
        for l in fin:
            for w in l.split():
                self.log(u"Testing [%s]" % w)
                accepted = s.check(w)
                if guru_accepted is None:
                    guru_accepted = accepted
                self.assertIsNotNone(accepted)
                self.log(u"+-accepted [%s]" % accepted)
                self.assertEqual(accepted, guru_accepted)
def test_spell_origin(self):
    """
    Check that every word of the dictionary file is accepted.

    The part before ``/`` of each dictionary line must give a truthy
    ``check`` result, while the same word with ``ehmmm`` appended must give
    a falsy one.  Entries containing a space are skipped.
    """
    from pyspell import speller
    from pyspell._utils import line_strip

    s = speller(
        self.aff_file(),
        self.dic_file(),
    )
    s.init()

    # Read-only is sufficient; "r+" would require write access to the .dic.
    with codecs.open(self.dic_file(), mode="r", encoding="utf-8") as fin:
        # Skip the first line (presumably the entry-count header of the
        # .dic format).  The builtin next() works on Python 2 and 3, and the
        # default avoids StopIteration on an empty file.
        next(fin, None)
        for i, l in enumerate(fin):
            w = line_strip(l).split("/")[0]
            # errors in dict
            if " " in w:
                continue
            self.assertTrue(s.check(w))
            self.assertFalse(s.check(w + "ehmmm"))
            if 0 == (i + 1) % 10000:
                self.log("done [%d]" % i)
def test_spell_origin(self):
    """
    Check that every word of the dictionary file is accepted.

    The part before ``/`` of each dictionary line must give a truthy
    ``check`` result, while the same word with ``ehmmm`` appended must give
    a falsy one.  Entries containing a space are skipped.
    """
    from pyspell import speller
    from pyspell._utils import line_strip

    s = speller(
        self.aff_file(),
        self.dic_file(),
    )
    s.init()

    # The dictionary is only read, so do not ask for write access ("r+").
    with codecs.open(self.dic_file(), mode="r", encoding="utf-8") as fin:
        # Drop the first line (presumably the entry-count header of the
        # .dic format); next() with a default is portable and tolerates an
        # empty file.
        next(fin, None)
        for i, l in enumerate(fin):
            w = line_strip(l).split("/")[0]
            # errors in dict
            if " " in w:
                continue
            self.assertTrue(s.check(w))
            self.assertFalse(s.check(w + "ehmmm"))
            if 0 == (i + 1) % 10000:
                self.log("done [%d]" % i)
def test_text(self):
    """
    Check every whitespace-separated token of the `mini` text file.

    Each token must yield a non-``None`` result from ``speller.check`` and
    that result must equal the result obtained for the first token.
    """
    from pyspell import speller

    s = speller(
        self.aff_file("mini"),
        self.dic_file("mini"),
    )
    s.init()

    # Open read-only: the fixture is never written, and "r+" fails when the
    # file is not writable.
    with codecs.open(self.text_file("mini"), mode="r", encoding="utf-8") as fin:
        # result for the first token, used as the reference for all others
        guru_accepted = None
        for l in fin:
            for w in l.split():
                self.log(u"Testing [%s]" % w)
                accepted = s.check(w)
                if guru_accepted is None:
                    guru_accepted = accepted
                self.assertIsNotNone(accepted)
                self.log(u"+-accepted [%s]" % accepted)
                self.assertEqual(accepted, guru_accepted)
def test_ignorecase_text(self):
    """ Run `check` over a table of (word, flag, expected) cases on the `small` dictionary. """
    from pyspell import speller

    checker = speller(self.aff_file("small"), self.dic_file("small"))
    checker.init()

    # (word, second argument of check, whether a non-None result is expected)
    cases = (
        (u"Abcházska", True, True),
        (u"abcházska", True, True),
        (u"Abcházsko", True, True),
        (u"abcházsko", True, True),
        ("Bratislava", False, True),
        ("Bratislave", False, True),
        ("Bratislavy", False, True),
        ("bratislava", False, False),
        ("bratislave", False, False),
        ("bratislavy", False, False),
    )
    for word, flag, should_accept in cases:
        result = checker.check(word, flag)
        self.assertEqual(should_accept, result is not None)
def test_ignorecase_text(self):
    """ Run `check` over a table of (word, flag, expected) cases on the `small` dictionary. """
    from pyspell import speller

    checker = speller(self.aff_file("small"), self.dic_file("small"))
    checker.init()

    # (word, second argument of check, whether a non-None result is expected)
    cases = (
        (u"Abcházska", True, True),
        (u"abcházska", True, True),
        (u"Abcházsko", True, True),
        (u"abcházsko", True, True),
        ("Bratislava", False, True),
        ("Bratislave", False, True),
        ("Bratislavy", False, True),
        ("bratislava", False, False),
        ("bratislave", False, False),
        ("bratislavy", False, False),
    )
    for word, flag, should_accept in cases:
        was_found = checker.check(word, flag) is not None
        self.assertEqual(should_accept, was_found)
def unknown_from_wiki(env):
    """
    How many words do we know from a list of most used ones?

    Reads the wiki word list line by line, checks every stripped line with
    the speller and writes the words for which ``check`` returns ``None`` to
    the "not found" file, one word per line.  Progress is logged every
    ``env["log_every_n"]`` words and once more at the end.

    :param env: nested configuration mapping; reads ``start_dir``,
        ``src_dir``, ``log_every_n``, ``input`` (``dir``, ``dictionaries``),
        ``output`` (``dir``, ``wiki_words``) and ``temp`` (``dir``,
        ``wiki_not_found``).
    :raises Exception: when the .aff file, the .dic file or the wiki word
        list does not exist.
    """
    # Make the project sources importable without adding duplicate entries
    # when the function is called repeatedly.
    src_path = os.path.join(env["start_dir"], env["src_dir"])
    if src_path not in sys.path:
        sys.path.insert(0, src_path)

    def _progress(cnt, cnt_nf, cnt_nf_f_cap, time_arr):
        """ Format one progress line; appends the current time to `time_arr`. """
        time_arr.append(time.time())
        # With no words processed report 0% instead of dividing by zero.
        total = float(cnt) if cnt else 1.0
        # not found words that do not start with an upper-case character
        cnt_nf_lower = cnt_nf - cnt_nf_f_cap
        return ("in [%.2fs] .. done [%8d] words ... [%5d][%.2f%%] not found ... "
                "[%5d][%.2f%%] not found lower") % (
            (time_arr[-1] - time_arr[-2]),
            cnt,
            cnt_nf, (100. * cnt_nf / total),
            cnt_nf_lower, (100. * cnt_nf_lower / total)
        )

    def _flush(fout, words):
        """ Write the buffered words one per line and empty the buffer. """
        if words:
            # The words were stripped, so the line separator has to be added
            # back; writelines() alone would glue them together.
            fout.write(u"\n".join(words) + u"\n")
            del words[:]

    dictionaries = env["input"]["dictionaries"]
    aff_file = os.path.join(
        env["start_dir"], env["input"]["dir"], dictionaries + ".aff")
    dic_file = os.path.join(
        env["start_dir"], env["input"]["dir"], dictionaries + ".dic")
    wiki_words_input = os.path.join(
        env["start_dir"], env["output"]["dir"], env["output"]["wiki_words"])
    log_every_n = env["log_every_n"]
    wiki_not_found_output = os.path.join(
        env["start_dir"], env["temp"]["dir"], env["temp"]["wiki_not_found"])

    if not os.path.exists(aff_file):
        raise Exception("AFF file not found [%s]" % aff_file)
    if not os.path.exists(dic_file):
        raise Exception("DIC file not found [%s]" % dic_file)
    if not os.path.exists(wiki_words_input):
        raise Exception("Wiki words input not found [%s]" % wiki_words_input)

    from pyspell import speller
    s = speller(aff_file, dic_file)
    s.init()

    ignorecase = False
    # number of buffered words after which the buffer is written out
    flush_threshold = 10000
    pos = 0
    not_found = 0
    not_found_first_cap = 0
    time_arr = [time.time()]
    _logger.info("Checking words...")
    # The output is only written and the input only read, so plain "w" and
    # "r" are enough.
    with codecs.open(wiki_not_found_output, mode="w", encoding="utf-8") as fout:
        with codecs.open(wiki_words_input, mode="r", encoding="utf-8") as fin:
            not_found_arr = []
            for l in fin:
                pos += 1
                l = l.strip()
                if s.check(l, ignorecase) is None:
                    not_found += 1
                    # slice instead of index: a blank line must not raise
                    if l[:1].isupper():
                        not_found_first_cap += 1
                    not_found_arr.append(l)
                    if flush_threshold < len(not_found_arr):
                        _flush(fout, not_found_arr)
                if 0 == pos % log_every_n:
                    _logger.info(_progress(pos, not_found,
                                           not_found_first_cap, time_arr))
            _flush(fout, not_found_arr)
    _logger.info(_progress(pos, not_found, not_found_first_cap, time_arr))
# -*- coding: utf-8 -*-
# author: jm
#
# Checks every word of the test dictionary LOOP times, each time together
# with a variant that has "ehmmm" appended (presumably a crude timing run --
# the results of check() are discarded).
import codecs
import test
from pyspell import speller
from pyspell._utils import line_strip


if __name__ == "__main__":
    aff_file = test.files.aff_file()[0]
    dic_file = test.files.dic_file()[0]
    s = speller(
        aff_file,
        dic_file
    )
    s.init()

    # how many times every word is checked
    LOOP = 5
    # The dictionary is only read; "r+" would require write access.
    with codecs.open(dic_file, mode="r", encoding="utf-8") as fin:
        # Skip the first line (presumably the entry-count header of the
        # .dic format); next() with a default tolerates an empty file.
        next(fin, None)
        for i, l in enumerate(fin):
            w = line_strip(l).split("/")[0]
            # errors in dict
            if " " in w:
                continue
            # The repeat counter must not reuse `i`: shadowing the line
            # index left it at LOOP - 1, so the progress line below was
            # never printed.
            for _ in range(LOOP):
                s.check(w)
                s.check(w + "ehmmm")
            if 0 == (i + 1) % 10000:
                print("done [%d]" % i)
def test_inspect(self):
    """ Call `inspect` with a callback that forwards its argument to `self.log`. """
    from pyspell import speller

    def _forward_to_log(item):
        return self.log(item)

    checker = speller(self.aff_file(), self.dic_file())
    checker.init()
    checker.inspect(_forward_to_log)
# -*- coding: utf-8 -*-
# author: jm
#
# Checks every word of the test dictionary LOOP times, each time together
# with a variant that has "ehmmm" appended (presumably a crude timing run --
# the results of check() are discarded).
import codecs
import test
from pyspell import speller
from pyspell._utils import line_strip


if __name__ == "__main__":
    aff_file = test.files.aff_file()[0]
    dic_file = test.files.dic_file()[0]
    s = speller(aff_file, dic_file)
    s.init()

    # how many times every word is checked
    LOOP = 5
    # Read-only access is enough for the dictionary file.
    with codecs.open(dic_file, mode="r", encoding="utf-8") as fin:
        # Skip the first line (presumably the entry-count header of the
        # .dic format); next() with a default tolerates an empty file.
        next(fin, None)
        for i, l in enumerate(fin):
            w = line_strip(l).split("/")[0]
            # errors in dict
            if " " in w:
                continue
            # Use a throw-away name for the repeat counter so the line index
            # `i` survives and the progress line below can actually fire.
            for _ in range(LOOP):
                s.check(w)
                s.check(w + "ehmmm")
            if 0 == (i + 1) % 10000:
                print("done [%d]" % i)
def test_inspect(self):
    """ Call `inspect` with a callback that forwards its argument to `self.log`. """
    from pyspell import speller

    checker = speller(self.aff_file(), self.dic_file())
    checker.init()

    def _log_item(item):
        return self.log(item)

    checker.inspect(_log_item)