Example #1
 def __init__(self, language: str, platform_folder: str,
              embeddings_path: str):
     """
     :param platform_folder: str, like datasets/Websites/FakeBrCorpus/
     :param embeddings_path: str, like embeddings/pt/model.txt
     """
     self.PLATFORM_FOLDER = platform_folder
     binary = embeddings_path.split('.')[-1] == 'bin'
     self.embeddings = KeyedVectors.load_word2vec_format(
         embeddings_path, binary=binary, unicode_errors='ignore')
     self.LANGUAGE = language
     self.PT_BR_dic = 'Dictionaries/pt_BR/pt_BR.dic'
     self.PT_BR_aff = 'Dictionaries/pt_BR/pt_BR.aff'
     self.EN_US_dic = 'Dictionaries/en_US/en_US.dic'
     self.EN_US_aff = 'Dictionaries/en_US/en_US.aff'
     self.BG_BG_dic = 'Dictionaries/bg_BG/bg.dic'
     self.BG_BG_aff = 'Dictionaries/bg_BG/bg.aff'
     if language == 'pt':
         self.spell_checker = hunspell.HunSpell(self.PT_BR_dic,
                                                self.PT_BR_aff)
     elif language == 'en':
         self.spell_checker = hunspell.HunSpell(self.EN_US_dic,
                                                self.EN_US_aff)
     else:
         self.spell_checker = hunspell.HunSpell(self.BG_BG_dic,
                                                self.BG_BG_aff)
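
The only non-obvious step in this constructor is the format switch on the embeddings file extension. A minimal standalone sketch of that logic (the path is hypothetical; gensim's KeyedVectors is the assumed dependency):

from gensim.models import KeyedVectors

embeddings_path = 'embeddings/pt/model.txt'  # hypothetical path
# A '.bin' extension selects word2vec's binary format; anything else is read as text.
binary = embeddings_path.split('.')[-1] == 'bin'
embeddings = KeyedVectors.load_word2vec_format(
    embeddings_path, binary=binary, unicode_errors='ignore')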
Example #2
def test_map_4(hpk_dic_words, hpk_dic_filepath, sb0n_REP10D_MAP5_filepath, sb0n_MAP5_filepath):
    '''
    Note D stands for DoubleVowel
    Example of failure
    ama
    maa sb0n_REP10D_MAP5_suggestions ['mā', 'ama']
    maa sb0n_MAP5_suggestions ['ama', 'maua'] - Show stopper, no 'mā'
    '''
    hobj_sb0n_REP10D_MAP5 = hunspell.HunSpell(hpk_dic_filepath, sb0n_REP10D_MAP5_filepath)
    hobj_sb0n_MAP5 = hunspell.HunSpell(hpk_dic_filepath, sb0n_MAP5_filepath)
    for word in hpk_dic_words:
        if " " not in word and "-" not in word:
            # Just test non-compound words, it's easier
            word_as_list = list(word)
            shuffle(word_as_list)
            jumbled_word = ''.join(word_as_list)
            if jumbled_word not in hpk_dic_words:
                sb0n_REP10D_MAP5_suggestions = [
                    x.decode() for x in hobj_sb0n_REP10D_MAP5.suggest(jumbled_word)]
                sb0n_MAP5_suggestions = [
                    x.decode() for x in hobj_sb0n_MAP5.suggest(jumbled_word)]
                print(word)
                print(jumbled_word, "sb0n_REP10D_MAP5_suggestions", sb0n_REP10D_MAP5_suggestions)
                print(jumbled_word, "sb0n_MAP5_suggestions", sb0n_MAP5_suggestions)
                assert sorted(sb0n_REP10D_MAP5_suggestions) == sorted(sb0n_MAP5_suggestions)
Example #3
    def __init__(self,
                 locale,
                 dict_dir='/usr/share/myspell/dicts/',
                 words=None):
        """Requires a locale to work correctly."""

        parts = re.split(r'\W', locale, maxsplit=2)
        lang = parts[0].lower()
        try:
            co = parts[1].upper()
        except IndexError:
            co = ''

        locale = u'_'.join([lang, co])

        if path.isfile(dict_dir + locale + '.dic'):
            self.hunspell = hunspell.HunSpell(dict_dir + locale + '.dic',
                                              dict_dir + locale + '.aff')
        elif path.isfile(dict_dir + lang + '.dic'):
            self.hunspell = hunspell.HunSpell(dict_dir + lang + '.dic',
                                              dict_dir + lang + '.aff')
        else:
            self.hunspell = None

        if self.hunspell and words:
            with open(words, 'r') as fp:
                lines = [l.strip() for l in fp.readlines()]
                for line in lines:
                    self.hunspell.add(line)
Example #4
async def on_message(message):
    # we do not want the bot to reply to itself
    if message.author == client.user:
        return

    if message.content.startswith("!suggest "):
        string = clean_content(message.content, "!suggest")
        hobj = hunspell.HunSpell("dictionaries/en_US.dic", "dictionaries/en_US.aff")
        if not hobj.spell(string):
            await message.channel.send(
                'Did you maybe mean "' + hobj.suggest(string)[0] + '"?'
            )
        else:
            await message.channel.send("Seems fine to me.")

    if message.content.startswith("!search "):
        string = clean_content(message.content, "!search")
        hobj = hunspell.HunSpell("dictionaries/en_US.dic", "dictionaries/en_US.aff")
        if not hobj.spell(string):
            data = Search(hobj.suggest(string)[0])
        else:
            data = Search(string)
        await message.channel.send("", embed=data.performSearch())

    if message.content.startswith("!build ") or message.content.startswith("!builds "):
        await message.channel.send(**BuildResponder(message).getReply())

    if message.content.startswith("!skill ") or message.content.startswith("!skills "):
        await message.channel.send(**SkillResponder(message).getReply())

    if message.content.startswith("!github"):
        await message.channel.send("https://github.com/rbridge/discord-divinity-bot")

    if message.content.startswith("!help"):
        await message.channel.send(**HelpResponder(message).getReply())
Example #5
def test_map_3(hpk_dic_words, hpk_dic_filepath, sb0n_MAP5_REP10_filepath, sb0n_MAP5_filepath):
    '''
    This passes and takes about 40 mins on the laptop,
    so MAP5 by itself is all that is needed.

    Note that when I originally ran this test, the MAP5_REP10 file
    had 'REP 20' in its header - this made the file misbehave.
    Something needs to be put in place to check that the number of
    REPs is as advertised.
    '''
    hobj_sb0n_MAP5_REP10 = hunspell.HunSpell(hpk_dic_filepath, sb0n_MAP5_REP10_filepath)
    hobj_sb0n_MAP5 = hunspell.HunSpell(hpk_dic_filepath, sb0n_MAP5_filepath)
    for word in hpk_dic_words:
        if " " not in word and "-" not in word:
            # Just test non-compound words, it's easier
            word_as_list = list(word)
            shuffle(word_as_list)
            jumbled_word = ''.join(word_as_list)
            if jumbled_word not in hpk_dic_words:
                sb0n_MAP5_REP10_suggestions = [
                    x.decode() for x in hobj_sb0n_MAP5_REP10.suggest(jumbled_word)]
                sb0n_MAP5_suggestions = [
                    x.decode() for x in hobj_sb0n_MAP5.suggest(jumbled_word)]
                print(word)
                print(jumbled_word, "sb0n_MAP5_REP10_suggestions", sb0n_MAP5_REP10_suggestions)
                print(jumbled_word, "sb0n_MAP5_suggestions", sb0n_MAP5_suggestions)
                assert sorted(sb0n_MAP5_REP10_suggestions) == sorted(sb0n_MAP5_suggestions)
Example #6
def test_encoding(hpk_dic_filepath,
                  test_aff_set_only_filepath,
                  test_aff_empty_filepath):

    hobj_no_encoding = hunspell.HunSpell(hpk_dic_filepath,
                                         test_aff_empty_filepath)
    with pytest.raises(ValueError):
        assert hobj_no_encoding.spell("ā")
    hobj_encoding = hunspell.HunSpell(hpk_dic_filepath,
                                      test_aff_set_only_filepath)
    assert hobj_encoding.spell("ā")
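
The fixture files are not shown, but the effect is easy to reproduce: an .aff whose only directive is SET UTF-8 lets hunspell accept the macronised vowel, while an empty .aff leaves the default encoding in place and the lookup fails. A minimal sketch, assuming a throwaway one-word dictionary:

import hunspell

# One-entry .dic file: the first line is the entry count.
with open('test.dic', 'w', encoding='utf-8') as f:
    f.write('1\nā\n')
# The .aff file declares the dictionary encoding.
with open('set_only.aff', 'w', encoding='utf-8') as f:
    f.write('SET UTF-8\n')

hobj = hunspell.HunSpell('test.dic', 'set_only.aff')
print(hobj.spell('ā'))  # True once the encoding is declared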
Example #7
 def __init__(self, lang='en'):
     if lang == 'en':
         self.hobj = hunspell.HunSpell('/root/hunspell/en_US.dic',
                                       '/root/hunspell/en_US.aff')
     elif lang == 'es':
         self.hobj = hunspell.HunSpell('/root/hunspell/es_ANY.dic',
                                       '/root/hunspell/es_ANY.aff')
     elif lang == 'nl':
         self.hobj = hunspell.HunSpell('/root/hunspell/nl_NL.dic',
                                       '/root/hunspell/nl_NL.aff')
     else:
         raise ValueError('Unsupported language')
Example #8
def baseline(corpus_file_name):
    dic_es = hunspellES.HunSpell("es.dic",
                                 "es.aff")  #Get Hunspell spanish dictionary
    dic_eu = hunspellEU.HunSpell('eu.dic',
                                 'eu.aff')  #Get Hunspell basque dictionary

    CS_Corpus = open(corpus_file_name, 'r')
    CS_Reader = csv.reader(CS_Corpus, delimiter=',', quotechar='"')
    next(CS_Reader)  # Skip header line

    count = 0
    right = 0
    total = 0

    y_true = []
    y_pred = []

    for row in CS_Reader:
        row_processed = getTweetTokensTags(row)
        hand_tagged_tags = []
        for token, tag in row_processed:
            hand_tagged_tags.append(tag)

        tokens = Process.tokenize(row[2])  # Tweet text tokenized
        predicted_tags = []
        for i in range(0, len(tokens)):
            t0 = tokens[i]
            if i > 0 and tokens[i - 1] not in [".", "!", "?"] and t0.istitle():
                predicted_tags.append("IE")
            elif t0.isupper():
                predicted_tags.append("IE")
            elif dic_es.spell(t0):
                predicted_tags.append("ES")
            elif dic_eu.spell(t0):
                predicted_tags.append("EUS")
            elif Process.isURL(t0):
                predicted_tags.append("URL")
            elif Process.isID(t0):
                predicted_tags.append("ID")
            else:
                predicted_tags.append("EG")

        y_true.append(hand_tagged_tags)
        y_pred.append(predicted_tags)

    print ""
    print "Sequence item accuracy score: %.5f" % seqItemAccuracyScore(
        y_true, y_pred)
    print "Sequence accuracy score: %.5f" % seqAccuracyScore(y_true, y_pred)
    print "Global tag accuracy score: %.5f" % globalTagAccuracyScore(
        y_true, y_pred)
Example #9
    def __init__(self, extra_words=(), **kwargs):
        super(Speller, self).__init__(**kwargs)
        if Speller.large_hspell is None:
            Speller.large_hspell = hunspell.HunSpell(
                os.path.join(config.hunspell_path, 'en_US.dic_large.utf8'),
                os.path.join(config.hunspell_path, 'en_US.aff.utf8'))

        self.hspell = hunspell.HunSpell(
            os.path.join(config.hunspell_path, 'en_US.dic_sw.utf8'),
            os.path.join(config.hunspell_path, 'en_US.aff.utf8'))
        self.extra_words = set(extra_words)
        for w in self.extra_words:
            self.hspell.add(w.lower())
Example #10
def test_rep_2(hpk_dic_words, hpk_dic_filepath, sb0n_filepath,
               sb0n_REP11_filepath):
    hobj_sb0n = hunspell.HunSpell(hpk_dic_filepath, sb0n_filepath)
    hobj_sb0n_REP11 = hunspell.HunSpell(hpk_dic_filepath, sb0n_REP11_filepath)
    for word in hpk_dic_words:
        if not " " in word and not "-" in word:
            # Just test non-compound words, its easier
            sb0n_suggestions = [x.decode() for x in hobj_sb0n.suggest(word)]
            sb0n_REP11_suggestions = [x.decode() for x in \
                                      hobj_sb0n_REP11.suggest(word)]
            print(word, "sb0n_suggestions", sb0n_suggestions)
            print(word, "sb0n_REP11_suggestions", sb0n_REP11_suggestions)
            assert sorted(sb0n_suggestions) == sorted(sb0n_REP11_suggestions)
Example #11
def brute_rot(*args):
    enc_string = " ".join([x for x in args])
    dec_strings = []
    for key in range(1, 26):
        dec_string = ""
        for c in enc_string:
            if c.isalpha():
                if c.islower():
                    dec_string += chr((ord(c) + key - 97) % 26 + 97)
                else:
                    dec_string += chr((ord(c) + key - 65) % 26 + 65)
            else:
                dec_string += c
        dec_strings.append(dec_string)

    # FIXME: sometimes it ranks non-words as words
    hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                             '/usr/share/hunspell/en_US.aff')
    msg_table = []
    for i, dec_string in enumerate(dec_strings):
        curr_rank = 0
        for word in dec_string.split():
            if hobj.spell(word):
                curr_rank += 1
        msg_table.append((curr_rank, i + 1, dec_string))

    msg_table = sorted(msg_table, key=lambda x: x[0], reverse=True)

    formatted_msg_table = "rank ┃ key ┃ decrypted_string\n" + "━━━━━╋━━━━━╋━━━━━━━━━━━━━━━━━" + "\n"
    for rank, key, dec_string in msg_table:
        formatted_msg_table += f"{rank:>4} ┃ {key:>3} ┃ {dec_string}\n"
    return ["```" + formatted_msg_table + "```"]
Example #12
def spell(filenames, debug=False):
    if hunspell is None:
        raise ImportError('hunspell is not installed on your system. If you want '
                          'to run `ontutils spell` please run pipenv install --dev --skip-lock. '
                          'You will need the development libs for hunspell on your system.')
    spell_objects = (u for r in Parallel(n_jobs=9)(delayed(get_spells)(f) for f in filenames) for u in r)
    hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic', '/usr/share/hunspell/en_US.aff')
    #nobj = hunspell.HunSpell(os.path.expanduser('~/git/domain_wordlists/neuroscience-en.dic'), '/usr/share/hunspell/en_US.aff')  # segfaults without aff :x
    collect = set()
    for filename, s, p, o in spell_objects:
        missed = False
        no = []
        for line in o.split('\n'):
            nline = []
            for tok in line.split(' '):
                prefix, tok, suffix = tokstrip(tok)
                #print((prefix, tok, suffix))
                if not hobj.spell(tok):# and not nobj.spell(tok):
                    missed = True
                    collect.add(tok)
                    nline.append(prefix + tc.red(tok) + suffix)
                else:
                    nline.append(prefix + tok + suffix)
            line = ' '.join(nline)
            no.append(line)
        o = '\n'.join(no)
        if missed:
            #print(filename, s, o)
            print('>>>', o)

    if debug:
        for tok in sorted(collect):
            print(tok)
        breakpoint()
Example #13
    def __init__(self, langlist, folders):
        """
        langlist - list of the languages ("ru_RU", "en_US", etc)
        """
        logger.debug('Initialize HunspellWrapper spell checker')

        # Key - language (en_US, ru_RU etc),
        # value - instance of the HunSpell class
        self._checkers = {}

        # Index - number of the dictionary,
        # value - tuple: (key for self._checkers, path to .dic file)
        self._customDicts = []

        dictsFinder = DictsFinder(folders)

        for lang in langlist:
            checker = None

            for path in dictsFinder.getFoldersForLang(lang):
                dic_file = os.path.join(path, lang + '.dic')
                aff_file = os.path.join(path, lang + '.aff')

                if (checker is None and os.path.exists(dic_file)
                        and os.path.exists(aff_file)):
                    checker = hunspell.HunSpell(dic_file, aff_file)
                elif checker is not None and os.path.exists(dic_file):
                    checker.add_dic(dic_file)

                logger.debug('Add dictionary: {}'.format(dic_file))

            if checker is not None:
                self._checkers[lang] = checker
Example #14
    def __init__(self, configDictionary):
        super(HunSpelling, self).__init__(configDictionary)
        self.profile = {
                "name" : "hunspelling-module",
                "class": "spelling",
                "supported-languages" : ["de", "en", "tr"]
                } 

        self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
        self.dict_file = utils.getKeyFromSectionInConfiguration('spelling', 'spelling-dict-file', None, configDictionary)
        if self.dict_file is None:
            print('*** Missing spelling-dict-file in configuration. Exiting.')
            sys.exit(1)

        self.aff_file = utils.getKeyFromSectionInConfiguration('spelling', 'spelling-aff-file', None, configDictionary)
        if self.aff_file is None:
            print('*** Missing spelling-aff-file in configuration. Exiting.')
            sys.exit(1)

        self.add_words_file = utils.getKeyFromSectionInConfiguration('spelling', 'training-add-words-from-file', None, configDictionary)

        self.speller = hunspell.HunSpell(self.dict_file, self.aff_file)
        if self.speller is None:
            print('>>>>>> Could not create speller...')
        tokenizer_language = utils.getKeyFromSectionInConfiguration('spelling', 'tokenizer-language', 'german', configDictionary)
        try:
            self.tokenizer = nltk.data.load('tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
        except LookupError:
            print('>>>>>> Could not load TOKENIZER language file.')
            sys.exit(1)

        if self.add_words_file is not None:
            self.train()
Example #15
def stem_with_hunspell(word):
    import hunspell
    huns_obj = hunspell.HunSpell(helper.get_nepali_dict_path(),
                                 helper.get_nepali_rules_path())
    res = huns_obj.stem(word)
    for r in res:
        print(r.decode())
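
pyhunspell returns stems as bytes, which is why the decode() above is needed. A hedged equivalent with an assumed English dictionary:

import hunspell

hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                         '/usr/share/hunspell/en_US.aff')
for r in hobj.stem('talking'):   # returns bytes, e.g. b'talking', b'talk'
    print(r.decode())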
Example #16
 def __init__(self):
     index_file = "../../indices/sample_collection_jsonl/"
     self.keyword_util = KeywordSearchUtil(index_file)
     self.semantic_util = SemanticSearchUtil()
     self.hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                   '/usr/share/hunspell/en_US.aff')
     self.index_reader = index.IndexReader(index_file)
Example #17
    def __init__(self, script_path, tmx_file):
        """Initialize object"""

        self.verbose = False
        self.tmx_file = tmx_file
        self.translations = {}

        self.script_path = script_path
        self.exceptions_path = os.path.join(script_path, os.path.pardir,
                                            "exceptions")
        self.errors_path = os.path.join(script_path, os.path.pardir, "errors")

        # Set up spellcheckers
        # Load hunspell dictionaries
        dictionary_path = os.path.join(self.script_path, os.path.pardir,
                                       "dictionaries")
        self.spellchecker = hunspell.HunSpell(
            os.path.join(dictionary_path, "it_IT.dic"),
            os.path.join(dictionary_path, "it_IT.aff"),
        )
        self.spellchecker.add_dic(
            os.path.join(dictionary_path, "mozilla_qa_specialized.dic"))

        # Extract strings
        self.extractStrings()

        # Run checks
        self.checkQuotes()
        self.checkSpelling()
Example #18
    def spellCorrect(self, origTweet=True):
        '''
        Description: Correct spelling mistakes using the hunspell engine.
        If origTweet is True, work on a copy of the original tokenized
        tweet; otherwise work on the current sentence.
        Returns: tokenized tweet
        '''

        if origTweet:
            line = copy.copy(self.tok)
        else:
            line = self.sen

        hspell = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                   '/usr/share/hunspell/en_US.aff')

        #pass it through a spell checker - hunspell
        for i in range(len(line)):
            flag = False
            for c in self.punc:
                if c in line[i]:
                    flag = True
                    break
            if not flag and line[i] != "**NAME**" and not hspell.spell(line[i]):
                suggestions = hspell.suggest(line[i])
                if suggestions:
                    line[i] = suggestions[0]
        self.sen = line
        return line
Example #19
    def __init__(self, server_address, RequestHandlerClass, settings, bind_and_activate=True):
        """Constructor. May be extended, do not override."""

        self.log_path = settings['log']
        self.key_file = settings['key']
        self.cert_file = settings['cert']
        self.allow_ip = IPRange(settings['allow_ip'])

        self.spellchecker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                              '/usr/share/hunspell/en_US.aff')

        SocketServer.TCPServer.__init__(self, server_address, RequestHandlerClass, False)

        # initialize SSL connection
        self.socket = ssl.wrap_socket(self.socket,
                                      keyfile=self.key_file,
                                      certfile=self.cert_file,
                                      cert_reqs=ssl.CERT_NONE,
                                      ssl_version=ssl.PROTOCOL_TLSv1,
                                      server_side=True)

        # start serving
        if bind_and_activate:
            self.server_bind()
            self.server_activate()
Example #20
def search_word_in_dict(word: str, dict: str, morphology: bool = True):
    global logger

    word = word.strip(' \n')
    words = [word]
    if morphology:
        hobj = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                 '/usr/share/hunspell/en_US.aff')
        if hobj.spell(word) and hobj.stem(word):
            words = [b.decode() for b in hobj.stem(word)]
            logger.debug('Get stems: {}.'.format(', '.join(words)))

    builder = IndexBuilder(dict)
    builder.check_build()
    for w in words:
        meanings = builder.mdx_lookup(w, ignorecase=True)
        if not meanings:
            continue
        logger.debug('Find {} meanings of word {} from dictionary {}.'.format(
            len(meanings), w, dict))
        if w != word:
            word = w
        return word, meanings[0]
    logger.debug('Cannot find word {} from dictionary {}.'.format(word, dict))
    return word, None
Example #21
def processSpellcheck(doc, directory, spellCheckDict):
    # do spell check
    try:
        hobj = hunspell.HunSpell('/usr/share/hunspell/%s.dic' % spellCheckDict,
                                 '/usr/share/hunspell/%s.aff' % spellCheckDict)
    except Exception:
        print("Error: Dictionary %r not found." % spellCheckDict)
        return
    # check wiki for custom dictionary
    update_custom_dictionary(hobj)  # ugly to call it for each document :-/

    file = os.path.join(directory, doc) + ".md"
    print "Doing spell check on %r" % file
    # read md file
    lines = list()
    out_lines = list()
    with open(file, 'r') as f:
        lines = f.readlines()
    for line in lines:
        words = line.strip("\n").split(" ")
        out_words = list()
        for w in words:
            # generate a version of the word that does not contain any non-alpha characters
            w_striped = w.strip(".:;,!?'_-")
            if w_striped.isalpha():
                if not hobj.spell(w_striped):
                    print "Spelling mistake: %r" % w_striped
                    # mark mistake! we need to stick to simple ASCII chars, things like <strike> could break the latex document if they occur, e.g., in headings
                    w = w.replace(w_striped, "??%s??" % w_striped)
            out_words.append(w)
        out_lines.append(" ".join(out_words))

    # write back
    with open(file, 'w') as f:
        f.write("\n".join(out_lines))
Example #22
def test_compound_suggestion(hpk_dic_words, hpk_dic_filepath, baseline_aff_filepath):
    hobj = hunspell.HunSpell(hpk_dic_filepath, baseline_aff_filepath)
    for word in hpk_dic_words:
        if " " in word:
            word_with_dashes = word.replace(" ", "-")
            suggestions = [x.decode() for x in hobj.suggest(word_with_dashes)]
            print(word, word_with_dashes, suggestions)
            assert word in suggestions
Example #23
 def __init__(self):
     self.stanfordCoreNLP = StanfordCoreNLP('http://localhost', port=9500)
     self.spacyNLP = spacy.load('en_core_web_sm')
     with open(join(FILE_PATH, "infVocab.json"), "r") as f:
         self.infoVocab = json.load(f)
     self.spellchecker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                           '/usr/share/hunspell/en_US.aff')
     self.lemmatizer = WordNetLemmatizer()
Example #24
def init_spellchecker(corpus):
    """
    Initializes the spell checker.
    It uses the corpus information to choose a proper language for spell checker.
    Returns the initialied spell checker
    """

    if corpus in ["conll03_en", "ontonotes"]:
        spell_check = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                        '/usr/share/hunspell/en_US.aff')
    elif corpus in ["conll03_de", "germeval"]:
        spell_check = hunspell.HunSpell('/usr/share/hunspell/de_DE.dic',
                                        '/usr/share/hunspell/de_DE.aff')
    else:
        spell_check = None

    return spell_check
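
A hedged usage sketch (the corpus labels come from the function itself; note the fallback branch returns None, so callers need a guard):

spell_check = init_spellchecker("conll03_de")
if spell_check is not None and not spell_check.spell("Hauz"):
    print(spell_check.suggest("Hauz"))  # suggestions likely include "Haus"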
Example #25
    def __init__(self):
        """
        Normalizer: restores original/formal spelling of misspelled/colloquial tokens, replaces twitter-specific phenomena for tagging

        Normalization requires setup of a dictionary of unigram/bigram occurrences (to make the most probable correction)
        - first, run collect_bigrams() on every tweet
        - then, the normalize() function can be used

        For detailed usage, see tests/test_normalizer
        """
        self.dictionary = hunspell.HunSpell(
            "/Users/ulisteinbach/Desktop/SS18/software_projekt/softwareprojekt/autosarkasmus/rsrc/hunspell/de_DE.dic",
            "/Users/ulisteinbach/Desktop/SS18/software_projekt/softwareprojekt/autosarkasmus/rsrc/hunspell/de_DE.aff"
        )
        self.bigrams = defaultdict(lambda: 1.0)  # add-1 smoothing
        self.unigrams = defaultdict(lambda: 1.0)
        self.emoji_pos = [
            u"\U0001F601", u"\U0001F602", u"\U0001F603", u"\U0001F604",
            u"\U0001F605", u"\U0001F606", u"\U0001F607", u"\U0001F608",
            u"\U0001F609", u"\U0001F60A", u"\U0001F60B", u"\U0001F60C",
            u"\U0001F60D", u"\U0001F60E", u"\U0001F60F", u"\U0001F638",
            u"\U0001F639", u"\U0001F63A", u"\U0001F63B", u"\u263a",
            u"\U0001f61d", u"\u2600", u"\U0001f44d", u"\u2665"
        ]
        self.emoji_pos += [
            u"\U0001f44c", u"\U0001f389", u"\U0001f49c", u"\U0001f499",
            u"\U0001f49b", u"\u2661", u"\U0001f497", u"\U0001f61c"
        ]
        self.emoji_pos += [u"\U0001f498"]
        self.emoji_neg = [
            u"\U0001F612", u"\U0001F61E", u"\U0001F61F", u"\U0001F620",
            u"\U0001F621", u"\U0001F622", u"\U0001F623", u"\U0001F625",
            u"\U0001F627", u"\U0001F628", u"\U0001F62D", u"\U0001F63E",
            u"\U0001F63F", u"\U0001f614", u"\U0001f44e", u"\U0001f616",
            u"\u2639", u"\U0001f494"
        ]
        self.emoji_neg += [u"\U0001f613", u"\U0001f645", u"\U0001f630"]
        self.emoji = [
            u"\U0001f3c3", u"\ue00e", u"\u2614", u"\U0001f4a8", u"\U0001f4a6",
            u"\u2601", u"\U0001f4b0", u"\U0001f341", u"\U0001f631",
            u"\U0001f4a4", u"\U0001f637", u"\U0001f436"
        ]
        self.emoji += [
            u"\U0001f64f", u"\U0001f4fa", u"\u270b", u"\U0001f633",
            u"\U0001f366", u"\U0001f632", u"\U0001f44f", u"\U0001f44a",
            u"\U0001f4a2", u"\U0001f497", u"\U0001f631"
        ]
        self.emoji += [
            u"\U0001f37a", u"\U0001f37b", u"\U0001f52b", u"\U0001f378",
            u"\U0001f48a", u"\U0001f483", u"\U0001f487", u"\U0001f4aa",
            u"\U0001f41f"
        ]
        self.special_tags = [
            "%HASHTAG%", "%MENTION%", "%SMILEYPOS%", "%SMILEYNEG%", "%SMILEY%",
            "%URL%", ",", ".", "!", "?", ":", ";", "-", "+++", "–", "\"", "|"
        ]
        self.tokens = 0
        self._cache_spelling = {}
Example #26
def test_break_0(hpk_dic_filepath, test_aff_break_0_only_filepath):
    # This shows the importance of setting BREAK 0 in the .aff file
    # The BREAK 0 results in 'new' words being marked WRONG (desired behaviour)
    # in this case "awa-kai" and "kai-awa"
    hobj = hunspell.HunSpell(hpk_dic_filepath, test_aff_break_0_only_filepath)
    assert hobj.spell("awa") == True
    assert hobj.spell("kai") == True
    assert hobj.spell("awa-kai") == False
    assert hobj.spell("kai-awa") == False
Example #27
def test_break_default(hpk_dic_filepath, test_aff_empty_filepath):
    # This shows the importance of setting BREAK 0 in the .aff file
    # The default results in 'new' words being marked ok
    # in this case "awa-kai" and "kai-awa"
    hobj = hunspell.HunSpell(hpk_dic_filepath, test_aff_empty_filepath)
    assert hobj.spell("awa") == True
    assert hobj.spell("kai") == True
    assert hobj.spell("awa-kai") == True
    assert hobj.spell("kai-awa") == True
Example #28
def hunspellTest():
    import hunspell
    spellchecker = hunspell.HunSpell('/usr/share/hunspell/en_US.dic',
                                     '/usr/share/hunspell/en_US.aff')
    print(spellchecker.spell("?"))
    spellchecker.add("Getter")
    print(spellchecker.spell("Getter-"))  # True
    suggestions = spellchecker.suggest("Private")
    print(suggestions)
Example #29
    def __init__(self,
                 dic_file='/usr/share/hunspell/hu_HU.dic',
                 aff_file='/usr/share/hunspell/hu_HU.aff',
                 task='dstem',
                 source_fields=None,
                 target_fields=None):
        """
        The initialisation of the module. One can extend the lsit of parameters as needed. The mandatory fields which
         should be set by keywords are the following:
        :param task: the task to specialise the current instance
        :param source_fields: the set of names of the input fields
        :param target_fields: the list of names of the output fields in generation order
        """
        # TODO: Heroku workaround for issue https://github.com/heroku/heroku-buildpack-apt/issues/35
        import os.path
        if not os.path.exists(dic_file):
            dic_file = '/app/.apt/usr/share/hunspell/hu_HU.dic'
            aff_file = '/app/.apt/usr/share/hunspell/hu_HU.aff'
            if not os.path.exists(dic_file):  # TODO: Alpine Linux workaround
                dic_file = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'dicts/hu_HU.dic')
                aff_file = os.path.join(
                    os.path.dirname(os.path.abspath(__file__)),
                    'dicts/hu_HU.aff')

        # Specialise the class for e.g. stemming or detailed output...
        available_tasks = {
            'spell': self._do_spell,
            'stem': self._do_stem,
            'analyze': self._do_analyze,
            'dstem': self._do_dstem
        }
        for keyword, key_fun in available_tasks.items():
            if task == keyword:
                self.process_token = key_fun
                break
        else:
            raise ValueError(
                'No proper task is specified. The available tasks are {0}'
                .format(' or '.join(available_tasks.keys())))

        # Field names for xtsv (the code below is mandatory for an xtsv module)
        if source_fields is None:
            source_fields = set()

        if target_fields is None:
            target_fields = []

        self.source_fields = source_fields
        self.target_fields = target_fields

        self.h = hunspell.HunSpell(dic_file, aff_file)
        self._added_words = set()
        self._removed_words = set()
        self._added_words_w_affix = {}
Example #30
def hunSpellcheck(language_country, word):
    import hunspell
    if language_country not in hunSpell_factory:
        hunSpell = hunspell.HunSpell('%s%s.dic' % (DICT_PATH, language_country),
                                     '%s%s.aff' % (DICT_PATH, language_country))
        hunSpell_factory[language_country] = hunSpell
    else:
        hunSpell = hunSpell_factory[language_country]

    return hunSpell.spell(word)
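
DICT_PATH and hunSpell_factory are module-level globals not shown in the snippet; a hedged usage sketch that defines them:

DICT_PATH = '/usr/share/hunspell/'  # assumed dictionary location
hunSpell_factory = {}               # per-locale cache of HunSpell instances

print(hunSpellcheck('en_US', 'hello'))  # True; builds and caches the checker
print(hunSpellcheck('en_US', 'helo'))   # False; reuses the cached instance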