Esempio n. 1
0
    def setup(self):
        """Wire an AutoCorrector to mocked spellcheckers with a level-2
        typo-correction config (fresh mocks per test)."""
        self.spellchecker = Mock()
        self.english_spellchecker = Mock()
        self.config = {
            "typo-correction-level": 2,
        }
        self.corrector = AutoCorrector(
            config=self.config,
            spellchecker=self.spellchecker,
            english_spellchecker=self.english_spellchecker,
        )
    def test_auto_corrector(self):
        """Every cached LT result should receive a non-None
        'corrected_content' entry after auto-correction."""
        corrector = AutoCorrector()
        for page in self.pages:
            cache_path = page[3].replace('.txt', '_nc.json')
            with open(cache_path) as cache_file:
                responses = json.load(cache_file)
            results = responses['results']
            for result in results:
                result['corrected_content'] = corrector.auto_correct(result)
            for result in results:
                self.assertIsNotNone(result.get('corrected_content'))

            '''
 def setUp(self):
     self.test_correcton_files = {
         'ca-ES': [
             '3422059e057636482a2230c3aa87dfeb.json',
             '36b63320f3a0e3be9fb5db9f6977ff2d.json',
             '4640bc501a5249d012cd8fa1db31bd77.json',
             'ef4f48c7860cba1910edafe6c7dbd332.json'
         ],
         'en-US': [
             '3c0b0033e4ae2e8182451a22159badea.json',
             '9e1b1202e55d4add7e376d451e3afa64.json'
         ]
     }
     self.test_corrector = AutoCorrector()
Esempio n. 4
0
    def __init__(self, exec_by_ibus):
        """Build the IBus component and engine description, wire the bus and
        engine factory, and assemble the bogo spell-checking stack.

        exec_by_ibus -- True when launched by the IBus daemon (claim the
        well-known bus name); False when run standalone (register the
        component and activate the engine manually).
        """
        engine_name = "bogo"
        long_engine_name = "BoGo"
        author = "BoGo Development Team <*****@*****.**>"
        description = "ibus-bogo for IBus"
        version = "0.4"
        license = "GPLv3"

        # Component metadata advertised to the IBus daemon.
        self.component = \
            IBus.Component.new("org.freedesktop.IBus.BoGo",
                               description,
                               version,
                               license,
                               author,
                               "https://github.com/BoGoEngine/ibus-bogo",
                               "/usr/bin/exec",
                               "ibus-bogo")

        engine = IBus.EngineDesc(
            name=engine_name,
            longname=long_engine_name,
            description=description,
            language="vi",
            license=license,
            author=author,
            icon=current_path + "/data/ibus-bogo-dev.svg",
            # icon = "ibus-bogo",
            layout="default")

        self.component.add_engine(engine)
        self.mainloop = GObject.MainLoop()
        self.bus = IBus.Bus()
        self.bus.connect("disconnected", self.bus_disconnected_cb)

        # The factory answers "create-engine" signals from the daemon.
        self.engine_count = 0
        self.factory = IBus.Factory.new(self.bus.get_connection())
        self.factory.connect("create-engine", self.create_engine)

        # Watch the user's abbreviation rule file for live reloads.
        CONFIG_DIR = os.path.expanduser("~/.config/ibus-bogo/")
        self.config = Config()
        self.abbr_expander = AbbreviationExpander(config=self.config)
        self.abbr_expander.watch_file(CONFIG_DIR + "/abbr_rules.json")

        if exec_by_ibus:
            self.bus.request_name("org.freedesktop.IBus.BoGo", 0)
        else:
            self.bus.register_component(self.component)
            self.bus.set_global_engine_async("bogo", -1, None, None, None)
        # Vietnamese dictionary plus a personal word list, served by a
        # private enchant broker pointed at the bundled dictionary path.
        custom_broker = enchant.Broker()
        custom_broker.set_param('enchant.myspell.dictionary.path', DICT_PATH)

        spellchecker = enchant.DictWithPWL('vi_VN_telex',
                                           pwl=PWL_PATH,
                                           broker=custom_broker)

        # FIXME: Catch enchant.errors.DictNotFoundError exception here.
        english_spellchecker = enchant.Dict('en_US')

        self.auto_corrector = AutoCorrector(self.config, spellchecker,
                                            english_spellchecker)
Esempio n. 5
0
 def __init__(self, botname, host = 'teixidora', languagetool = LT_URL):
     """Connect to the teixidora semantic wiki and reset correction state.

     :param botname: identifier for this bot instance.
     :param host: wiki host key; must appear in HOSTS.
     :param languagetool: LanguageTool API endpoint to use.
     :raises ValueError: if *host* is not a defined host.
     """
     if host not in HOSTS:
         msg = 'given host %s not in defined hosts: %s'%(host, str(HOSTS))
         logging.error(msg)
         raise ValueError(msg)
     self.site = pywikibot.Site('ca', host)
     self.botname = botname
     # BUGFIX: honour the caller-supplied endpoint; the original always
     # assigned the module constant LT_URL, silently ignoring the argument.
     self.languagetool = languagetool
     self.online = False
     # Workflow status parameters read from the event page template.
     self.params = {"bot import": None,
                    "bot correction": None,
                    "human review": None}
     self.outname = None
     self.declared_language = None
     self.local_corpus = set()
     self.get_global_corpus()
     self.auto_corrector = AutoCorrector()
    def setup(self):
        """Build an AutoCorrector backed by mock spellcheckers and a
        level-2 typo-correction configuration."""
        self.spellchecker = Mock()
        self.english_spellchecker = Mock()
        self.config = {"typo-correction-level": 2}
        self.corrector = AutoCorrector(
            config=self.config,
            spellchecker=self.spellchecker,
            english_spellchecker=self.english_spellchecker,
        )
class AutoCorrectorTestCase(unittest.TestCase):
    """Integration-style tests running AutoCorrector over cached
    LanguageTool responses stored per language."""

    def setUp(self):
        """Map language codes to cached response files and build a
        default corrector."""
        self.test_correcton_files = {
            'ca-ES': [
                '3422059e057636482a2230c3aa87dfeb.json',
                '36b63320f3a0e3be9fb5db9f6977ff2d.json',
                '4640bc501a5249d012cd8fa1db31bd77.json',
                'ef4f48c7860cba1910edafe6c7dbd332.json',
            ],
            'en-US': [
                '3c0b0033e4ae2e8182451a22159badea.json',
                '9e1b1202e55d4add7e376d451e3afa64.json',
            ],
        }
        self.test_corrector = AutoCorrector()

    def tearDown(self):
        pass

    def test_auto_correct(self):
        """Corrected output should stay within 10% of the input length."""
        for lang, cached in self.test_correcton_files.items():
            print('testing for', lang)
            first_cache = os.path.join(CACHE_FILES_PATH, cached[0])
            with open(first_cache) as handle:
                results = json.load(handle)['results']
            for entry in results:
                corrected = self.test_corrector.auto_correct(entry)
                # format check: length must not drift by more than 10%
                drift = abs(len(entry['content']) - len(corrected))
                self.assertLessEqual(drift / len(entry['content']), 0.1)
                # do language checks
                if lang == 'ca-ES':
                    pass
Esempio n. 8
0
    def __init__(self, config, abbr_expander):
        """Initialise the engine with a preedit and a surrounding-text
        backend sharing a single spell-checking auto-corrector.

        config -- engine configuration mapping.
        abbr_expander -- abbreviation expander shared by both backends.
        """
        super().__init__()

        self.caps = 0
        self.vietnameseMode = True

        self.config = config
        self.ui_delegate = UiDelegate(engine=self)

        # Vietnamese dictionary plus a personal word list, served by a
        # private enchant broker pointed at the bundled dictionary path.
        custom_broker = enchant.Broker()
        custom_broker.set_param('enchant.myspell.dictionary.path', DICT_PATH)

        spellchecker = enchant.DictWithPWL('vi_VN_telex',
                                           pwl=PWL_PATH,
                                           broker=custom_broker)

        # FIXME: Catch enchant.errors.DictNotFoundError exception here.
        english_spellchecker = enchant.Dict('en_US')

        auto_corrector = AutoCorrector(config, spellchecker,
                                       english_spellchecker)

        self.preedit_backend = PreeditBackend(engine=self,
                                              config=config,
                                              abbr_expander=abbr_expander,
                                              auto_corrector=auto_corrector)

        self.surrounding_text_backend = SurroundingTextBackend(
            engine=self,
            config=config,
            abbr_expander=abbr_expander,
            auto_corrector=auto_corrector)

        # The preedit backend is the default
        self.backend = self.preedit_backend
        self.reset()
Esempio n. 9
0
class TestAutoCorrector():
    """Unit tests for AutoCorrector.suggest() and its blacklist logic,
    driven entirely through mocked spellcheckers."""

    def setup(self):
        """Wire an AutoCorrector to mocked Vietnamese and English
        spellcheckers with a level-2 typo-correction config."""
        self.spellchecker = Mock()
        self.english_spellchecker = Mock()
        self.config = {
            "typo-correction-level": 2,
        }
        self.corrector = AutoCorrector(
            config=self.config,
            spellchecker=self.spellchecker,
            english_spellchecker=self.english_spellchecker,
        )

    def test_skip_blacklisted(self):
        """A blacklisted key sequence must come back unchanged."""
        # Blacklisted means spellchecker.check() returns True
        self.spellchecker.check.return_value = True
        word = "carl"
        eq_(self.corrector.suggest(word), word)

    def test_no_suggestion(self):
        """With no suggestions available the input is returned as-is."""
        self.spellchecker.check.return_value = False
        self.spellchecker.suggest.return_value = []
        word = "carl"
        eq_(self.corrector.suggest(word), word)

    def test_missing_space(self):
        """Two words typed together without a space get split apart."""
        self.spellchecker.check.return_value = False
        self.spellchecker.suggest.return_value = ["cas meof"]
        self.english_spellchecker.check.return_value = False
        eq_(self.corrector.suggest("casmeof"), "cá mèo")

    def test_level_zero_is_disable(self):
        """Correction level 0 disables typo correction entirely."""
        self.spellchecker.check.return_value = False
        self.spellchecker.suggest.return_value = ["cas meof"]
        self.english_spellchecker.check.return_value = False
        self.config["typo-correction-level"] = 0
        word = "casmeof"
        eq_(self.corrector.suggest(word), word)

    def test_skip_english(self):
        """Sequences the English spellchecker accepts are left alone."""
        self.spellchecker.check.return_value = False
        self.spellchecker.suggest.return_value = ["ser"]
        self.english_spellchecker.check.return_value = True
        word = "set"
        eq_(self.corrector.suggest(word), word)

    def test_blacklist_after_n_offences(self):
        """After N tickets (N from the config) the sequence is added to
        the spellchecker's word list, i.e. blacklisted."""
        self.spellchecker.check.return_value = False
        self.spellchecker.suggest.return_value = ["car"]
        self.config["typo-correction-threshold"] = 2
        word = "carl"
        for _ in range(self.config["typo-correction-threshold"]):
            self.corrector.increase_ticket(word)
        self.spellchecker.add.assert_called_once_with(word)
Esempio n. 10
0
class Bot(object):
    """Bot that imports the collaborative notes ("apunts") linked to a
    teixidora event page, sends them to LanguageTool, applies automatic
    corrections and publishes the results back to the wiki."""

    def __init__(self, botname, host = 'teixidora', languagetool = LT_URL):
        """Connect to the semantic wiki and reset per-run state.

        :param botname: identifier for this bot instance.
        :param host: wiki host key; must appear in HOSTS.
        :param languagetool: LanguageTool API endpoint to use.
        :raises ValueError: if *host* is not a defined host.
        """
        if host not in HOSTS:
            msg = 'given host %s not in defined hosts: %s'%(host, str(HOSTS))
            logging.error(msg)
            raise ValueError(msg)
        self.site = pywikibot.Site('ca', host)
        self.botname = botname
        # BUGFIX: honour the caller-supplied endpoint; the original always
        # assigned the module constant LT_URL, silently ignoring the argument.
        self.languagetool = languagetool
        self.online = False
        # Workflow status parameters read from the event page template.
        self.params = {"bot import": None,
                       "bot correction": None,
                       "human review": None}
        self.outname = None
        self.declared_language = None
        self.local_corpus = set()
        self.get_global_corpus()
        self.auto_corrector = AutoCorrector()

    def get_global_corpus(self):
        """Build the wiki-wide token corpus (names and stop words) that is
        protected from auto-correction, preferring the on-disk cache."""
        # TODO better file path handling
        if not os.path.exists(cache_filepath):
            global_corpus_dict = get_global_corpora(self.site)
        else:
            with open(cache_filepath) as cf:
                global_corpus_dict = json.load(cf)

        tokens = []
        for key, name_list in global_corpus_dict.items():
            if key not in ['exists', 'stop_words']:
                # we work with tokens, not concepts, hence split each name
                for name in name_list:
                    tokens += [clean_token(n.lower()) for n in name.split()]
            elif key == 'stop_words':
                # stop words can be compound, keep them whole
                for name in name_list:
                    tokens += [name.lower()]
        # convert list to set eliminating the empty strings
        self.global_corpus = set([token for token in tokens if token])

    def get_page(self, title_or_page):
        """Load an event page (title string or Page object) and initialise
        per-page state: correction status, cache paths, language, corpora."""
        if isinstance(title_or_page, str):
            self.title = title_or_page
            self.page = pywikibot.Page(self.site, self.title)
        else:
            self.page = title_or_page
            self.title = self.page.title()

        if not self.page.text:
            msg = "%s does not exist or not reachable"%title_or_page
            logging.warning(msg)
            #raise ValueError(msg)
        self.wikicode = mwparserfromhell.parse(self.page.text)

        # get bot correction and human review parameters
        self.get_correction_status()

        # cache out file is keyed by the md5 of the page title
        # TODO push to a db and use hash as the key
        h = hashlib.md5(self.title.encode('utf8'))
        self.outname = h.hexdigest()+'.json'
        self.outpath = os.path.join(PATH, 'cache/'+self.outname)

        # clean the notes and corrected notes objects if they were full
        self.notes = []
        self.corrected_notes = {}

        # get declared language
        self.get_declared_language()

        # tokens mentioned in semantic fields must never be "corrected"
        self.get_local_corpus()
        corpus = self.local_corpus.union(self.global_corpus)
        self.auto_corrector.corpus = corpus.difference(STOP_TOKENS)

    def get_correction_status(self):
        """Read the bot import / bot correction / human review parameters
        from the page templates into self.params."""
        # for each page the parameters should be reset
        self.params = {"bot import": None,
                       "bot correction": None,
                       "human review": None}

        for template in self.wikicode.filter_templates():
            for param in template.params:
                for key in self.params.keys():
                    if param.startswith(key):
                        # value starts right after "<key>=" in the parameter
                        i = len(key)+1
                        self.params[key] = param[i:].strip()

    def get_declared_language(self):
        """Resolve the page's declared language to a language code via the
        RE_LANGS patterns; leaves self.declared_language as None if absent
        or unrecognised."""
        lan_param = 'language'
        language = None
        for template in self.wikicode.filter_templates():
            for param in template.params:
                if param.startswith(lan_param):
                    language = template.get(lan_param)[len(lan_param)+1:]\
                                       .lower()
                    break
        # convert language to language code due to non-standard language
        # naming convention on the wiki
        if language:
            for lan_code, re_lan in RE_LANGS.items():
                if re_lan.search(language):
                    self.declared_language = lan_code
            if not self.declared_language:
                msg = 'WARNING: unknown language in the wiki page of the event %s'\
                      ''%language
                logging.warning(msg)
        else:
            msg = 'language not declared for the page %s'%self.title
            logging.debug(msg)

    def get_local_corpus(self):
        """Collect tokens from the page's semantic fields (speakers,
        keywords, organisations, ...); these are ignored by the corrector."""
        fields = ['projects mentioned', 'keywords', 'organizer',
                  'organizations mentioned', 'speakers',
                  'keywords in English', 'individuals mentioned']
        for field in fields:
            for template in self.wikicode.filter_templates():
                for param in template.params:
                    if param.startswith('%s='%field):
                        # we are interested in tokens not concepts hence
                        # we first get rid of the commas and then split
                        f_elements = template.get(field)
                        if f_elements:
                            elements_str = f_elements[len(field)+1:]\
                                                    .replace(',','')
                            elements = set(elements_str.strip().lower().split())
                            self.local_corpus = self.local_corpus.union(elements)
                            break
        # remove symbols if they appear as tokens
        stop_signs = set(['-', '?', '!', '/', '\\', '"', "'"])
        self.local_corpus = self.local_corpus.difference(stop_signs)

    def correct_notes(self, online=False):
        """Correct every note page attached to the current event page,
        storing the responses in self.corrected_notes."""
        self.online = online
        self.get_note_titles()
        if not self.notes:
            message = "no apunts url found for: %s"%self.title
            # BUGFIX: the message was built but never emitted.
            logging.warning(message)
        for note in self.notes:
            self.corrected_notes[note] = self.correct_note(note)

    def get_note_titles(self):
        """Find the note ("apunts") subpage for the event, checking the
        new title format first and falling back to the old one."""
        # currently inefficiently checks if the urls are full
        new_format = '/'.join([self.title, 'apunts', '01'])
        old_format = '/'.join([self.title, 'apunts'])
        note_page = pywikibot.Page(self.site, new_format)
        if note_page.text:
            self.notes = [new_format]
        else:
            note_page = pywikibot.Page(self.site, old_format)
            if note_page.text:
                self.notes = [old_format]

    def correct_note(self, note):
        """Fetch one note page and return its LanguageTool responses."""
        note_page = pywikibot.Page(self.site, note)
        # TODO extract only the content?
        content = note_page.text
        language = self.get_language(content)
        # TODO send the content to be corrected according to the LT rules
        return self.correct_content(content, language)

    def get_language(self, content):
        """Return the declared language of the page, defaulting to
        Catalan ('ca-ES') when none is declared."""
        # TODO currently done in corrector per paragraph
        if self.declared_language:
            return self.declared_language
        else:
            return 'ca-ES'

    def correct_content(self, content, language):
        """Send *content* to LanguageTool (chunked to the public API size
        limit) or to the offline corrector, caching responses on disk.

        Returns the responses dict {'title': ..., 'results': [...]}.
        """
        # TODO to be moved to LT processes class
        # Segments and sends the content to LT according to the
        # public api rate limits
        # http://wiki.languagetool.org/public-http-api

        if os.path.isfile(self.outpath):
            # cached responses exist for this page: reuse them
            msg = 'title exists in cache: %s'%self.title
            print(self.outpath)
            print(msg)
            logging.info(msg)
            with open(self.outpath) as f:
                responses = json.load(f)
            return responses
        else:
            responses = {'title': self.title, 'results': []}
            if self.online:
                per_req_size_limit = 6e3 # KB
                sentences = content.split('. ')
                requests = []
                # first/last sentence of each chunk, used for language probes
                test_chunks = []
                chunk = []
                for sentence in sentences:
                    chunk.append(sentence)
                    total_chunk = '. '.join(chunk)
                    if sys.getsizeof(total_chunk) > per_req_size_limit:
                        requests.append(total_chunk)
                        test_chunks.append((chunk[0], chunk[-1]))
                        chunk = []
                if chunk:
                    # add last chunk
                    requests.append('. '.join(chunk))
                    test_chunks.append((chunk[0], chunk[-1]))

                # send requests to api
                # TODO smarter rate limit control needed
                total_requests = len(requests)
                for i, request in enumerate(requests):
                    try:
                        response = api.check(request,
                                         api_url=self.languagetool,
                                         lang=language)
                    # TODO check language, if confidence lower than 0.90 resend
                    except Exception:
                        # probe the chunk's last and first sentences to let
                        # LT detect the actual language, then retry
                        msg = "%s language error. Trying to detect the language."\
                              ""%language
                        logging.warning(msg)
                        response = api.check(test_chunks[i][1],
                                         api_url=self.languagetool,
                                         lang=language)
                        language_bottom = response['language']['detectedLanguage']['code']
                        response = api.check(test_chunks[i][0],
                                         api_url=self.languagetool,
                                         lang=language_bottom)
                        language_top = response['language']['detectedLanguage']['code']
                        if language != language_top:
                            language = language_top
                        else:
                            language = language_bottom
                        msg = "%s detected as new language"%language
                        logging.info(msg)
                        response = api.check(request,
                                         api_url=self.languagetool,
                                         lang=language)
                    message = '%i/%i response sent'%(i+1, total_requests)
                    print(message)
                    logging.info(message)
                    if i+1 != total_requests:
                        # wait at all except the last LT api call
                        time.sleep(4)
                    responses['results'].append({'content': request,
                                                 'response': response})
            else:
                chunks = corrector.get_chunks(content)
                corrector.correct(chunks, responses)

            with open(self.outpath, 'w') as out:
                json.dump(responses, out, indent=2)
            return responses

    def implement_corrections(self):
        """Apply the auto-corrector to every corrected note and build
        self.targets as (url, original text, corrected text) triples."""
        self.targets = []
        if self.corrected_notes:
            # implements the corrections and pushes the results in
            # self.corrected_notes[note]['results'][i]['corrected_content']
            for url, responses in self.corrected_notes.items():
                for result in responses['results']:
                    final_corrected_content =\
                                   self.auto_corrector.auto_correct(result)
                    result['corrected_content'] = final_corrected_content
                target = [url,
                       '. '.join([c['content'] for c in responses['results']]),
                       '. '.join([c['corrected_content']\
                                              for c in responses['results']])]
                self.targets.append(target)
                with open(self.outpath.replace('.json', '_c.json'), 'w') as out:
                    json.dump(responses, out, indent=2)
        else:
            msg = 'no corrections found for %s'%self.title
            logging.warning(msg)

    def send_corrections(self):
        """Publish each target's original and corrected content to the
        '/correccions' subpage, then update the workflow parameters."""
        for url, content, corrected_content in self.targets:
            # TODO add labels for revised=False
            # TODO check if correction webpage exists
            correction_page = pywikibot.Page(self.site, url+'/correccions')
            correction_page.text = content
            correction_page.save('BOT - original content imported from %s'%url)
            correction_page.text = corrected_content
            correction_page.save('BOT - corrections implemented')
        self.change_param_value('bot correction', 'Feta')
        self.change_param_value('human review', 'Pendent')

    def change_param_value(self, param, new_value):
        """Rewrite template parameter *param* to *new_value* on the event
        page and save it, inserting the parameter after 'bot import' when
        it does not exist yet.

        :raises ValueError: if neither the parameter nor the 'bot import'
            anchor is present in the template.
        """
        old_value = self.params[param]
        if old_value:
            # NOTE(review): param/old_value are interpolated into the regex
            # unescaped; values containing regex metacharacters would
            # misbehave — confirm they are always plain text.
            new_text = re.sub('%s=%s'%(param, old_value),
                              '%s=%s'%(param, new_value),
                              self.page.text)
        else:
            # assumes there is always a bot import parameter to anchor on
            if self.params["bot import"] is None:
                msg = "cannot tick checkbox bcs parameter is not in the"\
                      " template and the anchor parameter bot import"\
                      " also doesn't exist.\n%s"%self.title
                logging.error(msg)
                raise ValueError(msg)
            else:
                bi_val = self.params['bot import']
                new_text = re.sub('bot import=%s\n'%bi_val,
                                  'bot import=%s\n|%s=%s\n'%(bi_val,
                                                             param,
                                                             new_value),
                                  self.page.text)
                if self.page.text == new_text:
                    # BUGFIX: Page.title is a method; call it so the message
                    # shows the title instead of a bound-method repr.
                    msg = "parameter not changed, cannot save a new version"\
                          "\n%s -> %s for %s"%(param, new_value,
                                               self.page.title())
                    print(msg)
                    logging.warning(msg)
        self.page.text = new_text
        self.page.save('BOT - %s parameter changed to %s'%(param, new_value))

    def replace_corrected_notes(self):
        """Copy manually reviewed '/correccions' content back over each
        note page and clear the human-review flag."""
        self.get_note_titles()
        for note in self.notes:
            correction = note+'/correccions'
            correction_page = pywikibot.Page(self.site, correction)
            if correction_page.text:
                note_page = pywikibot.Page(self.site, note)
                note_page.text = correction_page.text
                note_page.save("BOT - manual corrections implemented")
                self.change_param_value('human review', '')
            else:
                # BUGFIX: the format string had no argument, so a literal
                # '%s' was logged; pass the page name lazily instead.
                logging.warning("%s not found, manual correction cannot"
                                " be saved", correction)
Esempio n. 11
0
class TestAutoCorrector():
    """Unit tests for AutoCorrector.suggest() and its blacklist logic,
    driven entirely through mocked spellcheckers."""
    def setup(self):
        # Fresh mocks per test; the corrector is wired to them by keyword
        # so each test can reconfigure check()/suggest() independently.
        self.spellchecker = Mock()
        self.english_spellchecker = Mock()
        self.config = {
            "typo-correction-level": 2
        }

        self.corrector = AutoCorrector(
            config=self.config,
            spellchecker=self.spellchecker,
            english_spellchecker=self.english_spellchecker)

    def test_skip_blacklisted(self):
        """
        It should not auto-correct key sequences that are blacklisted.
        """
        # Blacklisted means spellchecker.check() returns True
        self.spellchecker.check = Mock(return_value=True)

        sequence = "carl"
        eq_(self.corrector.suggest(sequence), sequence)

    def test_no_suggestion(self):
        """
        It should return the input if there is no suggestion.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=[])

        sequence = "carl"
        eq_(self.corrector.suggest(sequence), sequence)

    def test_missing_space(self):
        """
        It should correct 2 words joined together without space.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["cas meof"])
        self.english_spellchecker.check = Mock(return_value=False)

        sequence = "casmeof"
        result = "cá mèo"
        eq_(self.corrector.suggest(sequence), result)

    def test_level_zero_is_disable(self):
        """
        It should return the input if the typo correction level is
        zero.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["cas meof"])
        self.english_spellchecker.check = Mock(return_value=False)
        # Level 0 turns typo correction off entirely.
        self.config["typo-correction-level"] = 0

        sequence = "casmeof"
        eq_(self.corrector.suggest(sequence), sequence)

    def test_skip_english(self):
        """
        It should skip sequences that are deemed to be English
        by the English spellchecker.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["ser"])
        self.english_spellchecker.check = Mock(return_value=True)

        sequence = "set"
        eq_(self.corrector.suggest(sequence), sequence)

    def test_blacklist_after_n_offences(self):
        """
        It should blacklist a key sequence after N tickets. N is
        specified by the config dictionary.
        """
        self.spellchecker.check = Mock(return_value=False)
        self.spellchecker.suggest = Mock(return_value=["car"])

        self.config["typo-correction-threshold"] = 2

        sequence = "carl"

        # Reaching the threshold should add the sequence to the
        # spellchecker's word list (i.e. blacklist it) exactly once.
        for i in range(self.config["typo-correction-threshold"]):
            self.corrector.increase_ticket(sequence)

        self.spellchecker.add.assert_called_once_with(sequence)