def setup(self):
    """Build a fresh AutoCorrector wired to mocked spellcheckers before each test."""
    self.spellchecker = Mock()
    self.english_spellchecker = Mock()
    self.config = {"typo-correction-level": 2}
    self.corrector = AutoCorrector(
        config=self.config,
        spellchecker=self.spellchecker,
        english_spellchecker=self.english_spellchecker,
    )
def test_auto_corrector(self):
    # Run a default AutoCorrector over every cached response file and check
    # that each result entry receives a non-None 'corrected_content' field.
    # NOTE(review): self.pages is not defined in any setUp visible here —
    # confirm which fixture provides it; page[3] is presumably a '.txt' path
    # whose sibling '_nc.json' file holds cached LanguageTool results.
    auto_corrector = AutoCorrector()
    for page in self.pages:
        with open(page[3].replace('.txt','_nc.json')) as f:
            responses = json.load(f)
        for result in responses['results']:
            result['corrected_content'] =\
                auto_corrector.auto_correct(result)
        for c in responses['results']:
            self.assertIsNotNone(c.get('corrected_content'))
# NOTE(review): the closing triple-quote below appears to end a commented-out
# region opened before this chunk — verify it is intentional.
'''
def setUp(self):
    """Map locale codes to their cached LanguageTool response files."""
    catalan_files = [
        '3422059e057636482a2230c3aa87dfeb.json',
        '36b63320f3a0e3be9fb5db9f6977ff2d.json',
        '4640bc501a5249d012cd8fa1db31bd77.json',
        'ef4f48c7860cba1910edafe6c7dbd332.json',
    ]
    english_files = [
        '3c0b0033e4ae2e8182451a22159badea.json',
        '9e1b1202e55d4add7e376d451e3afa64.json',
    ]
    self.test_correcton_files = {
        'ca-ES': catalan_files,
        'en-US': english_files,
    }
    self.test_corrector = AutoCorrector()
def __init__(self, exec_by_ibus):
    """Register the BoGo engine with IBus and set up shared services.

    :param exec_by_ibus: True when IBus itself launched this process, in
        which case we only request our bus name; otherwise we register the
        component ourselves and force-activate the engine.
    """
    engine_name = "bogo"
    long_engine_name = "BoGo"
    author = "BoGo Development Team <*****@*****.**>"
    description = "ibus-bogo for IBus"
    version = "0.4"
    license = "GPLv3"
    # Component descriptor advertised on the bus.
    self.component = \
        IBus.Component.new("org.freedesktop.IBus.BoGo",
                           description,
                           version,
                           license,
                           author,
                           "https://github.com/BoGoEngine/ibus-bogo",
                           "/usr/bin/exec",
                           "ibus-bogo")
    engine = IBus.EngineDesc(name=engine_name,
                             longname=long_engine_name,
                             description=description,
                             language="vi",
                             license=license,
                             author=author,
                             icon=current_path + "/data/ibus-bogo-dev.svg",
                             # icon = "ibus-bogo",
                             layout="default")
    self.component.add_engine(engine)
    self.mainloop = GObject.MainLoop()
    self.bus = IBus.Bus()
    self.bus.connect("disconnected", self.bus_disconnected_cb)
    # Counts engines created via the factory callback.
    self.engine_count = 0
    self.factory = IBus.Factory.new(self.bus.get_connection())
    self.factory.connect("create-engine", self.create_engine)
    CONFIG_DIR = os.path.expanduser("~/.config/ibus-bogo/")
    self.config = Config()
    # Expander reloads its rules whenever the watched file changes.
    self.abbr_expander = AbbreviationExpander(config=self.config)
    self.abbr_expander.watch_file(CONFIG_DIR + "/abbr_rules.json")
    if exec_by_ibus:
        self.bus.request_name("org.freedesktop.IBus.BoGo", 0)
    else:
        self.bus.register_component(self.component)
        self.bus.set_global_engine_async("bogo", -1, None, None, None)
    # Vietnamese dictionary with a personal word list, via a private broker
    # so the dictionary path override does not leak into other users.
    custom_broker = enchant.Broker()
    custom_broker.set_param('enchant.myspell.dictionary.path',
                            DICT_PATH)
    spellchecker = enchant.DictWithPWL('vi_VN_telex',
                                       pwl=PWL_PATH,
                                       broker=custom_broker)
    # FIXME: Catch enchant.errors.DictNotFoundError exception here.
    english_spellchecker = enchant.Dict('en_US')
    self.auto_corrector = AutoCorrector(self.config,
                                        spellchecker,
                                        english_spellchecker)
def __init__(self, botname, host='teixidora', languagetool=LT_URL):
    """Initialize the connection to the teixidora semantic wiki.

    :param botname: name the bot operates under.
    :param host: wiki host key; must be one of the keys in HOSTS.
    :param languagetool: LanguageTool API endpoint (defaults to LT_URL).
    :raises ValueError: if *host* is not a configured host.
    """
    if host not in HOSTS:
        msg = 'given host %s not in defined hosts: %s' % (host, str(HOSTS))
        logging.error(msg)
        raise ValueError(msg)
    self.site = pywikibot.Site('ca', host)
    self.botname = botname
    # Bug fix: the caller-supplied endpoint was previously discarded
    # (self.languagetool was always set to LT_URL).
    self.languagetool = languagetool
    self.online = False
    # Wiki template parameters tracked per page; reset in get_correction_status.
    self.params = {"bot import": None,
                   "bot correction": None,
                   "human review": None}
    self.outname = None
    self.declared_language = None
    self.local_corpus = set()
    self.get_global_corpus()
    self.auto_corrector = AutoCorrector()
def setup(self):
    """Per-test fixture: mock out both spellcheckers and build a corrector."""
    self.spellchecker, self.english_spellchecker = Mock(), Mock()
    self.config = {"typo-correction-level": 2}
    self.corrector = AutoCorrector(config=self.config,
                                   spellchecker=self.spellchecker,
                                   english_spellchecker=self.english_spellchecker)
class AutoCorrectorTestCase(unittest.TestCase):
    """Exercise AutoCorrector against cached LanguageTool response files."""

    def setUp(self):
        catalan = ['3422059e057636482a2230c3aa87dfeb.json',
                   '36b63320f3a0e3be9fb5db9f6977ff2d.json',
                   '4640bc501a5249d012cd8fa1db31bd77.json',
                   'ef4f48c7860cba1910edafe6c7dbd332.json']
        english = ['3c0b0033e4ae2e8182451a22159badea.json',
                   '9e1b1202e55d4add7e376d451e3afa64.json']
        self.test_correcton_files = {'ca-ES': catalan, 'en-US': english}
        self.test_corrector = AutoCorrector()

    def tearDown(self):
        pass

    def test_auto_correct(self):
        for locale, cache_files in self.test_correcton_files.items():
            print('testing for', locale)
            # Only the first cached file of each locale is exercised.
            path = os.path.join(CACHE_FILES_PATH, cache_files[0])
            with open(path) as tf:
                for result in json.load(tf)['results']:
                    corrected = self.test_corrector.auto_correct(result)
                    # Format sanity check: the corrected text must stay
                    # within 10% of the original content's length.
                    delta = abs(len(result['content']) - len(corrected))
                    self.assertLessEqual(delta / len(result['content']), 0.1)
                    # Language-specific checks are not implemented yet.
                    if locale == 'ca-ES':
                        pass
def __init__(self, config, abbr_expander):
    """Set up the input engine: UI delegate, spellcheckers, and both
    text-commit backends (preedit and surrounding-text).

    :param config: shared configuration object, also passed to backends.
    :param abbr_expander: abbreviation expander shared by both backends.
    """
    super().__init__()
    self.caps = 0
    self.vietnameseMode = True
    self.config = config
    self.ui_delegate = UiDelegate(engine=self)
    # Private enchant broker so the dictionary-path override stays local.
    custom_broker = enchant.Broker()
    custom_broker.set_param('enchant.myspell.dictionary.path',
                            DICT_PATH)
    # Vietnamese dictionary plus a personal word list (PWL).
    spellchecker = enchant.DictWithPWL('vi_VN_telex',
                                       pwl=PWL_PATH,
                                       broker=custom_broker)
    # FIXME: Catch enchant.errors.DictNotFoundError exception here.
    english_spellchecker = enchant.Dict('en_US')
    auto_corrector = AutoCorrector(config,
                                   spellchecker,
                                   english_spellchecker)
    self.preedit_backend = PreeditBackend(engine=self,
                                          config=config,
                                          abbr_expander=abbr_expander,
                                          auto_corrector=auto_corrector)
    self.surrounding_text_backend = SurroundingTextBackend(
        engine=self,
        config=config,
        abbr_expander=abbr_expander,
        auto_corrector=auto_corrector)
    # The preedit backend is the default
    self.backend = self.preedit_backend
    self.reset()
class TestAutoCorrector():
    """Behavioural tests for AutoCorrector.suggest()/increase_ticket()
    with both spellcheckers mocked out."""

    def setup(self):
        # Fresh mocks and a fresh corrector before every test.
        self.spellchecker = Mock()
        self.english_spellchecker = Mock()
        self.config = {"typo-correction-level": 2}
        self.corrector = AutoCorrector(
            config=self.config,
            spellchecker=self.spellchecker,
            english_spellchecker=self.english_spellchecker)

    def test_skip_blacklisted(self):
        """
        It should not auto-correct key sequences that are blacklisted.
        """
        # A blacklisted sequence is one the spellchecker accepts.
        self.spellchecker.check = Mock(return_value=True)
        typed = "carl"
        eq_(self.corrector.suggest(typed), typed)

    def test_no_suggestion(self):
        """
        It should return the input if there is no suggestion.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=[])
        typed = "carl"
        eq_(self.corrector.suggest(typed), typed)

    def test_missing_space(self):
        """
        It should correct 2 words joined together without space.
        """
        self.english_spellchecker.check = Mock(return_value=False)
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["cas meof"])
        eq_(self.corrector.suggest("casmeof"), "cá mèo")

    def test_level_zero_is_disable(self):
        """
        It should return the input if the typo correction level is zero.
        """
        self.english_spellchecker.check = Mock(return_value=False)
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["cas meof"])
        self.config["typo-correction-level"] = 0
        typed = "casmeof"
        eq_(self.corrector.suggest(typed), typed)

    def test_skip_english(self):
        """
        It should skip sequences that are deemed to be English by the
        English spellchecker.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["ser"])
        self.english_spellchecker.check = Mock(return_value=True)
        typed = "set"
        eq_(self.corrector.suggest(typed), typed)

    def test_blacklist_after_n_offences(self):
        """
        It should blacklist a key sequence after N tickets.

        N is specified by the config dictionary.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["car"])
        self.config["typo-correction-threshold"] = 2
        typed = "carl"
        for _ in range(self.config["typo-correction-threshold"]):
            self.corrector.increase_ticket(typed)
        self.spellchecker.add.assert_called_once_with(typed)
class Bot(object):
    """Bot that fetches note ('apunts') pages from the teixidora wiki,
    runs them through LanguageTool, applies auto-corrections, and writes
    the results back to the wiki.

    Per-run state lives on the instance (page, title, notes, params, ...)
    and is refreshed by get_page() for each processed page.
    """

    def __init__(self, botname, host = 'teixidora', languagetool = LT_URL):
        # initializes the connection to teixidora semantic wiki
        if host not in HOSTS:
            msg = 'given host %s not in defined hosts: %s'%(host, str(HOSTS))
            logging.error(msg)
            raise ValueError(msg)
        self.site = pywikibot.Site('ca', host)
        self.botname = botname
        # NOTE(review): the `languagetool` parameter is ignored here — the
        # attribute is always set to LT_URL. Likely should be `languagetool`.
        self.languagetool = LT_URL
        self.online = False
        # Wiki-template parameters tracked per page.
        self.params = {"bot import": None,
                       "bot correction": None,
                       "human review": None}
        self.outname = None
        self.declared_language = None
        self.local_corpus = set()
        self.get_global_corpus()
        self.auto_corrector = AutoCorrector()

    def get_global_corpus(self):
        """Build self.global_corpus: lowercase tokens from the site-wide
        corpora (names, stop words), read from cache when available."""
        # TODO better file path handling
        if not os.path.exists(cache_filepath):
            global_corpus_dict = get_global_corpora(self.site)
        else:
            with open(cache_filepath) as cf:
                global_corpus_dict = json.load(cf)
        tokens = []
        for key, name_list in global_corpus_dict.items():
            if key not in ['exists', 'stop_words']:
                for name in name_list:
                    tokens += [clean_token(n.lower()) for n in name.split()]
            elif key == 'stop_words':
                # stop words can be compound
                for name in name_list:
                    tokens += [name.lower()]
        # convert list to set eliminating the empty strings
        self.global_corpus = set([token for token in tokens if token])

    def get_page(self, title_or_page):
        """Load a wiki page (by title or Page object) and reset all
        per-page state: params, cache path, notes, language, corpora."""
        # get a new teixidora page initializing the rest of the variables
        if type(title_or_page) == str:
            self.title = title_or_page
            self.page = pywikibot.Page(self.site, self.title)
        else:
            self.page = title_or_page
            self.title = self.page.title()
        if not self.page.text:
            msg = "%s does not exist or not reachable"%title_or_page
            logging.warning(msg)
            #raise ValueError(msg)
        self.wikicode = mwparserfromhell.parse(self.page.text)
        # get bot correction and human review parameters
        self.get_correction_status()
        # get cache out file hash
        # TODO push to a db and use hash as the key
        h = hashlib.md5(self.title.encode('utf8'))
        self.outname = h.hexdigest()+'.json'
        self.outpath = os.path.join(PATH, 'cache/'+self.outname)
        # clean the notes and corrected notes objects if they were full
        self.notes = []
        self.corrected_notes = {}
        # get declared language
        self.get_declared_language()
        # get mentioned elements from semantic fields
        self.get_local_corpus()
        # Corrector ignores every token known from the page or site corpora.
        corpus = self.local_corpus.union(self.global_corpus)
        self.auto_corrector.corpus = corpus.difference(STOP_TOKENS)

    def get_correction_status(self):
        """Read 'bot import' / 'bot correction' / 'human review' values
        from the page's templates into self.params."""
        # for each page the parameters should be "resetted"
        self.params = {"bot import": None,
                       "bot correction": None,
                       "human review": None}
        for template in self.wikicode.filter_templates():
            for param in template.params:
                for key in self.params.keys():
                    if param.startswith(key):
                        # skip "<key>=" prefix to get the raw value
                        i = len(key)+1
                        self.params[key] = param[i:].strip()

    def get_declared_language(self):
        """Set self.declared_language from the page's 'language' template
        parameter, mapped to a language code via RE_LANGS."""
        lan_param = 'language'
        language = None
        for template in self.wikicode.filter_templates():
            for param in template.params:
                if param.startswith(lan_param):
                    language = template.get(lan_param)[len(lan_param)+1:]\
                        .lower()
                    break
        # convert language to language code due to non-standard language
        # naming convenion
        if language:
            for lan_code, re_lan in RE_LANGS.items():
                if re_lan.search(language):
                    self.declared_language = lan_code
            if not self.declared_language:
                msg = 'WARNING: unknown language in the wiki page of the event %s'\
                      ''%language
                logging.warning(msg)
        else:
            msg = 'language not declared for the page %s'%self.title
            logging.debug(msg)

    def get_local_corpus(self):
        """Collect tokens from the page's semantic fields (names, keywords,
        ...) into self.local_corpus."""
        # tokens extracted here will be ignored in the correction
        # implementation
        fields = ['projects mentioned', 'keywords', 'organizer',
                  'organizations mentioned', 'speakers',
                  'keywords in English', 'individuals mentioned']
        for field in fields:
            for template in self.wikicode.filter_templates():
                for param in template.params:
                    if param.startswith('%s='%field):
                        # we are interested in tokens not concepts hence
                        # we first get rid of the commas and then split
                        f_elements = template.get(field)
                        if f_elements:
                            elements_str = f_elements[len(field)+1:]\
                                .replace(',','')
                            elements = set(elements_str.strip().lower().split())
                            self.local_corpus = self.local_corpus.union(elements)
                        break
        # remove symbols if they appear as tokens
        stop_signs = set(['-', '?', '!', '/', '\\', '"', "'"])
        self.local_corpus = self.local_corpus.difference(stop_signs)

    def correct_notes(self, online=False):
        """Correct every note page found for the current title; results go
        into self.corrected_notes keyed by note title."""
        self.online = online
        self.get_note_titles()
        if not self.notes:
            # NOTE(review): `message` is built but never logged or raised.
            message = "no apunts url found for: %s"%self.title
        for note in self.notes:
            self.corrected_notes[note] = self.correct_note(note)

    def get_note_titles(self):
        """Fill self.notes with the existing 'apunts' subpage title."""
        # placeholder to extract the apunts links
        # currently unefficiently checks if the urls are full
        # checks first the new format, only if it doesn't exist checks the old
        new_format = '/'.join([self.title, 'apunts', '01'])
        old_format = '/'.join([self.title, 'apunts'])
        note_page = pywikibot.Page(self.site, new_format)
        if note_page.text:
            self.notes = [new_format]
        else:
            note_page = pywikibot.Page(self.site, old_format)
            if note_page.text:
                self.notes = [old_format]

    def correct_note(self, note):
        """Fetch one note page and return its LanguageTool responses."""
        note_page = pywikibot.Page(self.site, note)
        # TODO extract only the content?
        content = note_page.text
        language = self.get_language(content)
        # TODO send the content to be corrected according to the LT rules
        return self.correct_content(content, language)

    def get_language(self, content):
        """Return the declared language, falling back to Catalan."""
        # TODO currently done in corrector per paragraph
        if self.declared_language:
            return self.declared_language
        else:
            return 'ca-ES'

    def correct_content(self, content, language):
        """Send *content* to LanguageTool in rate-limited chunks (online) or
        to the local corrector (offline); results are cached on disk.

        Returns a dict: {'title': ..., 'results': [{'content', 'response'}]}.
        """
        # TODO to be moved to LT processes class
        # Segments and sends the content to LT according to the
        # public api rate limits
        # http://wiki.languagetool.org/public-http-api
        if os.path.isfile(self.outpath):
            # cache hit: reuse previously fetched responses
            msg = 'title exists in cache: %s'%self.title
            print(self.outpath)
            print(msg)
            logging.info(msg)
            with open(self.outpath) as f:
                responses = json.load(f)
            return responses
        else:
            responses = {'title': self.title, 'results': []}
            if self.online:
                per_req_size_limit = 6e3 # KB
                sentences = content.split('. ')
                requests = []
                test_chunks = []
                chunk = []
                # Greedily pack sentences until a chunk crosses the size cap.
                for sentence in sentences:
                    chunk.append(sentence)
                    total_chunk = '. '.join(chunk)
                    if sys.getsizeof(total_chunk) > per_req_size_limit:
                        requests.append(total_chunk)
                        # keep first/last sentence for language re-detection
                        test_chunks.append((chunk[0], chunk[-1]))
                        chunk = []
                if chunk:
                    # add last chunk
                    requests.append('. '.join(chunk))
                    test_chunks.append((chunk[0], chunk[-1]))
                # send requests to api
                # TODO smarter rate limit control needed
                total_requests = len(requests)
                for i, request in enumerate(requests):
                    try:
                        response = api.check(request,
                                             api_url=self.languagetool,
                                             lang=language)
                        # TODO check language, if confidence lower than 0.90
                        # resend
                    except Exception as e:
                        # On failure, re-detect the language from the chunk's
                        # last and first sentences and retry once.
                        msg = "%s language error. Trying to detect the language."\
                              ""%language
                        logging.warning(msg)
                        response = api.check(test_chunks[i][1],
                                             api_url=self.languagetool,
                                             lang=language)
                        language_bottom = response['language']['detectedLanguage']['code']
                        response = api.check(test_chunks[i][0],
                                             api_url=self.languagetool,
                                             lang=language_bottom)
                        language_top = response['language']['detectedLanguage']['code']
                        if language != language_top:
                            language = language_top
                        else:
                            language = language_bottom
                        msg = "%s detected as new language"%language
                        logging.info(msg)
                        response = api.check(request,
                                             api_url=self.languagetool,
                                             lang=language)
                    message = '%i/%i response sent'%(i+1, total_requests)
                    print(message)
                    logging.info(message)
                    if i+1 != total_requests:
                        # wait at all except the last LT api call
                        time.sleep(4)
                    responses['results'].append({'content': request,
                                                 'response': response})
            else:
                # offline mode: use the local corrector instead of the API
                chunks = corrector.get_chunks(content)
                corrector.correct(chunks, responses)
            with open(self.outpath, 'w') as out:
                json.dump(responses, out, indent = 2)
            return responses

    def implement_corrections(self):
        """Apply auto-corrections to every corrected note, save '_c.json'
        cache files, and build self.targets for send_corrections()."""
        self.targets = []
        if self.corrected_notes:
            # implements the corrections and pushes the results in
            # self.corrected_notes[note]['results'][i]['corrected_content']
            for url, responses in self.corrected_notes.items():
                for result in responses['results']:
                    final_corrected_content =\
                        self.auto_corrector.auto_correct(result)
                    result['corrected_content'] = final_corrected_content
                # target: (note url, original text, corrected text)
                target = [url,
                          '. '.join([c['content']
                                     for c in responses['results']]),
                          '. '.join([c['corrected_content']\
                                     for c in responses['results']])]
                self.targets.append(target)
                with open(self.outpath.replace('.json', '_c.json'), 'w') as out:
                    json.dump(responses, out, indent = 2)
        else:
            msg = 'no corrections found for %s'%self.title
            logging.warning(msg)

    def send_corrections(self):
        """Publish original + corrected text to each note's '/correccions'
        page and update the event page's status parameters."""
        for url, content, corrected_content in self.targets:
            # TODO add labels for revised=False
            # TODO check if correction webpage exists
            correction_page = pywikibot.Page(self.site, url+'/correccions')
            correction_page.text = content
            correction_page.save('BOT - original content imported from %s'%url)
            correction_page.text = corrected_content
            correction_page.save('BOT - corrections implemented')
        self.change_param_value('bot correction', 'Feta')
        self.change_param_value('human review', 'Pendent')

    def change_param_value(self, param, new_value):
        """Rewrite '<param>=<old>' to '<param>=<new>' in the page text and
        save; if the parameter is absent, append it after 'bot import'."""
        old_value = self.params[param]
        if old_value:
            new_text = re.sub('%s=%s'%(param, old_value),
                              '%s=%s'%(param, new_value),
                              self.page.text)
        else:
            # assumes there is always bot import parameter
            if self.params["bot import"] == None:
                msg = "cannot tick checkbox bcs parameter is not in the"\
                      " template and the anchor parameter bot import"\
                      " also doesn't exist.\n%s"%self.title
                logging.error(msg)
                raise ValueError(msg)
            else:
                bi_val = self.params['bot import']
                new_text = re.sub('bot import=%s\n'%bi_val,
                                  'bot import=%s\n|%s=%s\n'%(bi_val, param,
                                                             new_value),
                                  self.page.text)
        if self.page.text == new_text:
            msg = "parameter not changed, cannot save a new version"\
                  "\n%s -> %s for %s"%(param, new_value, self.page.title)
            print(msg)
            logging.warning(msg)
        self.page.text = new_text
        self.page.save('BOT - %s parameter changed to %s'%(param, new_value))

    def replace_corrected_notes(self):
        """Copy each '/correccions' page back over its note page and clear
        the 'human review' flag."""
        self.get_note_titles()
        for note in self.notes:
            correction = note+'/correccions'
            correction_page = pywikibot.Page(self.site, correction)
            if correction_page.text:
                note_page = pywikibot.Page(self.site, note)
                note_page.text = correction_page.text
                note_page.save("BOT - manual corrections implemented")
                self.change_param_value('human review', '')
            else:
                # NOTE(review): this warning has a %s placeholder but no
                # argument is supplied — the note title is never interpolated.
                logging.warning("%s not found, manual correction cannot"\
                                " be saved")
class TestAutoCorrector():
    """Unit tests for AutoCorrector: suggestion, skipping, and ticket-based
    blacklisting, with mocked Vietnamese and English spellcheckers."""

    def setup(self):
        # nose-style per-test fixture.
        self.spellchecker = Mock()
        self.english_spellchecker = Mock()
        self.config = {"typo-correction-level": 2}
        self.corrector = AutoCorrector(
            config=self.config,
            spellchecker=self.spellchecker,
            english_spellchecker=self.english_spellchecker)

    def _assert_unchanged(self, seq):
        # Helper: suggest() must echo the input back untouched.
        eq_(self.corrector.suggest(seq), seq)

    def test_skip_blacklisted(self):
        """
        It should not auto-correct key sequences that are blacklisted.
        """
        # Blacklisted means the spellchecker accepts the sequence as-is.
        self.spellchecker.check = Mock(return_value=True)
        self._assert_unchanged("carl")

    def test_no_suggestion(self):
        """
        It should return the input if there is no suggestion.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=[])
        self._assert_unchanged("carl")

    def test_missing_space(self):
        """
        It should correct 2 words joined together without space.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["cas meof"])
        self.english_spellchecker.check = Mock(return_value=False)
        expected = "cá mèo"
        eq_(self.corrector.suggest("casmeof"), expected)

    def test_level_zero_is_disable(self):
        """
        It should return the input if the typo correction level is zero.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["cas meof"])
        self.english_spellchecker.check = Mock(return_value=False)
        self.config["typo-correction-level"] = 0
        self._assert_unchanged("casmeof")

    def test_skip_english(self):
        """
        It should skip sequences that are deemed to be English by the
        English spellchecker.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["ser"])
        self.english_spellchecker.check = Mock(return_value=True)
        self._assert_unchanged("set")

    def test_blacklist_after_n_offences(self):
        """
        It should blacklist a key sequence after N tickets.

        N is specified by the config dictionary.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["car"])
        self.config["typo-correction-threshold"] = 2
        seq = "carl"
        for _ in range(self.config["typo-correction-threshold"]):
            self.corrector.increase_ticket(seq)
        self.spellchecker.add.assert_called_once_with(seq)