コード例 #1
0
    def test_clear_caches_persistance(self):
        """After save_cache() and clear_cache(), a new disk-backed instance starts with empty caches."""
        scratch_dir = tempfile.mkdtemp()
        try:
            # Both instances share the same dictionary, cache directory and cache manager.
            disk_backed = dict(
                hunspell_data_dir=DICT_DIR,
                disk_cache_dir=scratch_dir,
                cache_manager='disk_hun',
            )
            first = Hunspell('test', **disk_backed)
            real_suggest = first.suggest('testing')
            real_stem = first.stem('testing')

            # Plant results under a made-up word and confirm they are served back.
            first._suggest_cache['made-up'] = real_suggest
            self.assertEqual(first.suggest('made-up'), real_suggest)
            first._stem_cache['made-up'] = real_stem
            self.assertEqual(first.stem('made-up'), real_stem)

            first.save_cache()
            first.clear_cache()
            del first

            # Drop every cache registered with the manager before re-creating the instance.
            manager = get_cache_manager('disk_hun')
            manager.deregister_all_caches()
            self.assertEqual(len(manager.cache_by_name), 0)

            second = Hunspell('test', **disk_backed)

            self.assertEqual(len(second._suggest_cache), 0)
            self.assertEqual(len(second._stem_cache), 0)
            self.assertNotEqual(second.suggest('made-up'), real_suggest)
            self.assertNotEqual(second.stem('made-up'), real_stem)
        finally:
            shutil.rmtree(scratch_dir)  # Nuke temp content
コード例 #2
0
def make_checker(data_dir=None):
    '''
    Create a Hunspell checker for the 'de_DE_frami' dictionary.

    :param data_dir: directory holding the dictionary files. When None (the
        default), the original platform-specific path is used: one path on
        Windows, another on every other system.
    :return: Hunspell object h
    '''
    if data_dir is None:
        # Fall back to the paths that were previously hard-coded.
        if platform.system() == 'Windows':
            data_dir = "C:\\Users\\Lena_Langholf\\Dropbox\\Spell_Checking\\dictionaries"
        else:
            data_dir = "/home/lena/Desktop/million_post_corpus/dictionaries"
    return Hunspell('de_DE_frami', hunspell_data_dir=data_dir)
コード例 #3
0
 def test_bad_path_encoding(self, *mocks):
     """A data-dir path that cannot be encoded is rejected.

     On Python 3 the constructor must raise HunspellFilePathError; on
     Python 2 the error text is expected on the captured C stderr instead.
     """
     undecodable_dir = u'bad/\udcc3/decoding'
     if not PY3:
         # Python 2 just make an illegal string instead of raising
         with captured_c_stderr_file() as caperr:
             Hunspell('not_checked', hunspell_data_dir=undecodable_dir)
             with open(caperr, 'r') as err:
                 self.assertRegexpSearch(err.read(), r'error:[^\n]*bad/[^\n]*/decoding')
         return

     with self.assertRaises(HunspellFilePathError):
         Hunspell('not_checked', hunspell_data_dir=undecodable_dir)
コード例 #4
0
def spell_corrector(df, lang1, lang2):
    """Spell-correct every sentence in ``df['L1']`` with Hunspell.

    Each sentence is split on word boundaries. Pieces that are not purely
    alphabetic (spaces, punctuation) and words Hunspell accepts are copied
    unchanged. A rejected word is replaced by Hunspell's first suggestion;
    if Hunspell offers no suggestion at all the word is kept as it is.

    :param df: dataframe with an 'L1' column of sentences, plus an 'L2'
        column when ``lang2`` is not None
    :param lang1: not used by this function; kept so existing callers work
    :param lang2: when not None, the 'L2' column of ``df`` is carried over
        unchanged into the result
    :return: pandas DataFrame with column 'L1' (and 'L2' when ``lang2`` is
        not None)
    """
    # Create an object of the Hunspell class
    h = Hunspell()
    print('I am spell_checker')

    corrected_sentences = []
    for sent in df['L1']:
        pieces = []
        for w in re.split(r'\b', sent):
            if not w.isalpha() or h.spell(w):
                pieces.append(w)
                continue
            suggestions = h.suggest(w)
            # TODO : Parse the list and find the n-gram probability to find the best
            # candidate. For now the first suggestion is used; with no suggestions
            # the original word is kept rather than raising IndexError.
            pieces.append(suggestions[0] if suggestions else w)
        corrected_sentences.append(''.join(pieces))

    # Convert the corrected sentences into a pandas dataframe to return
    if lang2 is not None:
        return pd.DataFrame.from_dict({'L1': corrected_sentences, 'L2': list(df['L2'])})
    return pd.DataFrame(corrected_sentences, columns=['L1'])
コード例 #5
0
ファイル: lmgec.py プロジェクト: blcuicall/gec-data-synthesis
def loadResources(args):
    """Load the language model, spaCy pipeline, spellchecker and word lists.

    :param args: object whose ``model`` attribute is passed to ``kenlm.Model``
    :return: dict with the keys "lm", "nlp", "gb", "gb_infl", "det", "prep"
    """
    # Directory of this source file; bundled resources are located relative to it.
    here = os.path.dirname(os.path.realpath(__file__))
    return {
        # Language model built by KenLM: https://github.com/kpu/kenlm
        "lm": kenlm.Model(args.model),
        # spaCy pipeline
        "nlp": spacy.load("en"),
        # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
        # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
        "gb": Hunspell("en_GB-large", hunspell_data_dir=here + '/resources/spelling/'),
        # Inflection forms: http://wordlist.aspell.net/other/
        "gb_infl": loadWordFormDict(here + "/resources/agid-2016.01.19/infl.txt"),
        # Common determiners (the empty string is included)
        "det": {"", "the", "a", "an"},
        # Common prepositions (the empty string is included)
        "prep": {
            "", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"
        },
    }
コード例 #6
0
 def test_windows_utf_8_encoding_applies_prefix(self, *mocks):
     """With os.name patched to 'nt', construction succeeds and C stderr mentions '/not/valid'."""
     with captured_c_stderr_file() as caperr:
         # If python file existance checks used prefix, this would raise a HunspellFilePathError
         with patch("os.name", 'nt'):
             Hunspell('test', system_encoding='UTF-8')
         with open(caperr, 'r') as err:
             stderr_text = err.read()
             # But the Hunspell library lookup had the prefix applied
             self.assertRegexpSearch(stderr_text, r'error:[^\n]*/not/valid[^\n]*')
コード例 #7
0
    def __init__(self, dialect, script):
        """Store the dialect/script pair and load the matching Hunspell dictionary.

        Sorani in the Arabic script loads the "ckb-Arab" dictionary. Kurmanji
        in the Latin script is accepted without error, but no Hunspell
        instance is created for it here. Any other combination raises
        Exception.
        """
        self.dialect = dialect
        self.script = script

        # Maps the short Hunspell flag names to descriptive labels.
        self.hunspell_flags = {
            "po": "pos",
            "is": "description",
            "ts": "terminal_suffix",
            "ds": "formation",
        }

        if self.dialect == "Sorani" and self.script == "Arabic":
            self.huns = Hunspell("ckb-Arab", hunspell_data_dir=klpt.get_data("data/"))
        elif self.dialect != "Kurmanji" or self.script != "Latin":
            raise Exception("Sorry, only Sorani dialect in the Arabic script is supported now. Stay tuned for other dialects and scripts!")
コード例 #8
0
 def __init__(self, language='en_US', hunspell_data_dir='./hunspell', n_jobs=1):
     """Set up a Hunspell checker for ``language``.

     ``SpellChecker.get_dict`` is called first with the language and data
     directory. The Hunspell instance keeps its disk cache in a 'cache'
     sub-directory of ``hunspell_data_dir`` and its concurrency is set to
     ``n_jobs``.
     """
     SpellChecker.get_dict(language, hunspell_data_dir)

     cache_dir = os.path.join(hunspell_data_dir, 'cache')
     self.hunspell = Hunspell(
         language,
         hunspell_data_dir=hunspell_data_dir,
         disk_cache_dir=cache_dir,
     )
     self.hunspell.set_concurrency(n_jobs)

     # Starts empty; filled elsewhere.
     self.substitutes = {}
コード例 #9
0
    def test_non_overlapping_caches(self):
        """Cache entries planted on self.h are not served by a separately created 'en_US' instance."""
        cached_suggest = self.h.suggest('testing')
        cached_stem = self.h.stem('testing')

        # Plant results under a made-up word and confirm self.h serves them back.
        self.h._suggest_cache['made-up'] = cached_suggest
        self.assertEqual(self.h.suggest('made-up'), cached_suggest)
        self.h._stem_cache['made-up'] = cached_stem
        self.assertEqual(self.h.stem('made-up'), cached_stem)

        other = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(other.suggest('made-up'), cached_suggest)
        self.assertNotEqual(other.stem('made-up'), cached_stem)
コード例 #10
0
    def run(self) -> Generator[Tuple[int, int, str, type], None, None]:
        """Run the linter and return a generator of errors."""
        with open(self.filename, 'r') as source:
            found = get_comments(source.read())

        # Materialise the comments so they can be indexed below.
        comment_list = list(found)

        checker = Hunspell()
        # Only the item at the fixed position [1][2][0] is spell-checked; the
        # result is printed and does not influence the yielded error.
        verdict = checker.spell(comment_list[1][2][0])
        print(verdict)
        yield (0, 0, f'KOL001 Bad language found: ', TypoChecker)
コード例 #11
0
def new_hunspell_nl() -> Hunspell:
    """Create an "nl-nl" Hunspell instance extended with the project's extra word lists."""
    dict_dir = __resolve_path("../dict/")
    speller = Hunspell("nl-nl", hunspell_data_dir=str(dict_dir))

    # add words that are not present in current dictionary
    for extra_words in (get_plural_nouns(), get_basic_words()):
        for entry in extra_words:
            if speller.spell(entry):
                continue
            speller.add(entry)

    return speller
コード例 #12
0
    def hunspell(self) -> Hunspell:
        """
        Return the Hunspell instance, creating and storing it in
        ``self._hunspell`` on first use.
        """
        existing = self._hunspell
        if existing:
            return existing

        # Nothing stored yet: build the instance from the language's dictionary name.
        self._hunspell = Hunspell(
            self.lang.get_hunspell_dict_name(),
            hunspell_data_dir=self.hunspell_data_dir,
        )
        return self._hunspell
コード例 #13
0
    def test_overlapping_caches(self):
        """Planted cache entries are still served after self.h is deleted and re-created for 'test'."""
        cached_suggest = self.h.suggest('testing')
        cached_stem = self.h.stem('testing')

        # Plant results under a made-up word and confirm they are served back.
        self.h._suggest_cache['made-up'] = cached_suggest
        self.assertEqual(self.h.suggest('made-up'), cached_suggest)
        self.h._stem_cache['made-up'] = cached_stem
        self.assertEqual(self.h.stem('made-up'), cached_stem)

        # Replace the instance; the planted entries must still be returned.
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertEqual(self.h.suggest('made-up'), cached_suggest)
        self.assertEqual(self.h.stem('made-up'), cached_stem)
コード例 #14
0
ファイル: spellers.py プロジェクト: GarryGaller/nlp_toolkit
    def __init__(
        self,
        lang='en',
        max_dist=2,
        cpu=os.cpu_count(),
        # cache_manager="hunspell",disk_cache_dir=None,
        # hunspell_data_dir=None,system_encoding=None
        spell_kwargs=None):
        """Create the Hunspell dictionary used for spelling lookups.

        :param lang: language key; translated through ``self.langs`` when a
            mapping exists, otherwise used as given
        :param max_dist: stored as ``self.max_dist``
        :param cpu: value passed to ``set_concurrency``; the default is
            ``os.cpu_count()`` evaluated once, when the method is defined
        :param spell_kwargs: extra keyword arguments forwarded to
            ``Hunspell``; None (the default) means no extra arguments
        """
        # A fresh dict per call replaces the former shared mutable default ({}).
        if spell_kwargs is None:
            spell_kwargs = {}

        self.lang = self.langs.get(lang, lang)
        self.spell_dict = Hunspell(self.lang, **spell_kwargs)
        self.max_dist = max_dist
        self.spell_dict.set_concurrency(cpu)
コード例 #15
0
ファイル: hunspell_test.py プロジェクト: ulwan/sihunspell_id
def test_clear_caches_persistance(hunspell):
    """After save_cache() and clear_cache(), a new disk-backed instance does not serve the planted entries."""
    scratch_dir = tempfile.mkdtemp()
    try:
        # Both instances share the same dictionary, cache directory and cache manager.
        disk_backed = dict(
            hunspell_data_dir=DICT_DIR,
            disk_cache_dir=scratch_dir,
            cache_manager='disk_hun',
        )
        first = Hunspell('test', **disk_backed)
        real_suggest = first.suggest('testing')
        real_suffix = first.suffix_suggest('testing')
        real_stem = first.stem('testing')

        # Plant results under a made-up word and confirm they are served back.
        first._suggest_cache['made-up'] = real_suggest
        assert first.suggest('made-up') == real_suggest
        first._suffix_cache['made-up'] = real_suffix
        assert first.suffix_suggest('made-up') == real_suffix
        first._stem_cache['made-up'] = real_stem
        assert first.stem('made-up') == real_stem

        first.save_cache()
        first.clear_cache()
        del first

        # Drop every cache registered with the manager before re-creating the instance.
        manager = get_cache_manager('disk_hun')
        manager.deregister_all_caches()
        assert len(manager.cache_by_name) == 0

        second = Hunspell('test', **disk_backed)

        assert len(second._suggest_cache) == 0
        assert len(second._stem_cache) == 0
        assert second.suggest('made-up') != real_suggest
        assert second.suffix_suggest('made-up') != real_suffix
        assert second.stem('made-up') != real_stem
    finally:
        shutil.rmtree(scratch_dir)  # Nuke temp content
コード例 #16
0
ファイル: __init__.py プロジェクト: zhanglipku/mediacloud
    def __init__(self):
        """Constructor.

        Creates the Treebank tokenizer, locates the bundled Hindi Hunspell
        dictionary next to this source file, loads it, and runs a one-word
        stemming self-test.

        :raises McLanguageException: if the dictionary directory, the .dic
            file or the .aff file is missing, if Hunspell cannot be
            initialized, or if the self-test fails. When another exception
            triggered the failure it is attached as the cause.
        """
        super().__init__()

        self.__treebank_tokenizer = TreebankWordTokenizer()

        hunspell_dict_dir = os.path.join(
            os.path.dirname(os.path.abspath(__file__)),
            'hindi-hunspell',
            'dict-hi_IN',
        )
        if not os.path.isdir(hunspell_dict_dir):
            raise McLanguageException(
                "Hunspell dictionary directory does not exist at path: %s." %
                hunspell_dict_dir)

        # Report the path of the file that is actually missing, not just its directory.
        dic_path = os.path.join(hunspell_dict_dir, 'hi_IN.dic')
        if not os.path.isfile(dic_path):
            raise McLanguageException(
                "Hunspell dictionary file does not exist at path: %s" %
                dic_path)
        aff_path = os.path.join(hunspell_dict_dir, 'hi_IN.aff')
        if not os.path.isfile(aff_path):
            raise McLanguageException(
                "Hunspell affix file does not exist at path: %s" %
                aff_path)

        try:
            self.__hindi_hunspell = Hunspell(
                lang='hi_IN', hunspell_data_dir=hunspell_dict_dir)
        except Exception as ex:
            # Chain the original exception so its traceback is not lost.
            raise McLanguageException(
                "Unable to initialize Hunspell with data directory '%s': %s" %
                (
                    hunspell_dict_dir,
                    str(ex),
                )) from ex

        # Quick self-test to make sure that Hunspell is installed and dictionary is available
        hunspell_exc_message = """
            Hunspell self-test failed; make sure that Hunspell is installed and dictionaries are accessible, e.g.
            you might need to fetch Git submodules by running:

                git submodule update --init --recursive
        """
        try:
            test_stems = self.stem_words(['गुरुओं'])
        except Exception as ex:
            raise McLanguageException(hunspell_exc_message) from ex

        if len(test_stems) == 0 or test_stems[0] != 'गुरु':
            raise McLanguageException(hunspell_exc_message)
コード例 #17
0
    def test_clear_caches_non_peristance(self):
        """After clear_cache(), a re-created 'test' instance no longer serves the planted entries."""
        cached_suggest = self.h.suggest('testing')
        cached_stem = self.h.stem('testing')

        # Plant results under a made-up word and confirm they are served back.
        self.h._suggest_cache['made-up'] = cached_suggest
        self.assertEqual(self.h.suggest('made-up'), cached_suggest)
        self.h._stem_cache['made-up'] = cached_stem
        self.assertEqual(self.h.stem('made-up'), cached_stem)

        self.h.clear_cache()

        # Replace the instance; the planted entries must be gone.
        del self.h
        self.h = Hunspell('test', hunspell_data_dir=DICT_DIR)
        self.assertNotEqual(self.h.suggest('made-up'), cached_suggest)
        self.assertNotEqual(self.h.stem('made-up'), cached_stem)
コード例 #18
0
ファイル: hunspell_test.py プロジェクト: ulwan/sihunspell_id
def test_non_overlapping_caches(hunspell):
    """Cache entries planted on the fixture are not served by a separately created 'en_US' instance."""
    real_suggest = hunspell.suggest('testing')
    real_suffix = hunspell.suffix_suggest('testing')
    real_stem = hunspell.stem('testing')

    # Plant results under a made-up word and confirm the fixture serves them back.
    hunspell._suggest_cache['made-up'] = real_suggest
    assert hunspell.suggest('made-up') == real_suggest
    hunspell._suffix_cache['made-up'] = real_suffix
    assert hunspell.suffix_suggest('made-up') == real_suffix
    hunspell._stem_cache['made-up'] = real_stem
    assert hunspell.stem('made-up') == real_stem

    other = Hunspell('en_US', hunspell_data_dir=DICT_DIR)
    assert other.suggest('made-up') != real_suggest
    assert other.stem('made-up') != real_stem
コード例 #19
0
 def test_hunspell_bulk_stem(self):
     """bulk_action("stem", ...) returns a dict mapping each word to its list of stems."""
     speller = Hunspell('en_US', hunspell_data_dir=DICT_DIR)

     two_word_stems = {
         'permanently': ['permanent'],
         'dog': ['dog'],
     }
     self.assertDictEqual(
         speller.bulk_action("stem", ['dog', 'permanently']), two_word_stems)

     four_words = ['dog', 'twigs', 'permanently', 'unrecorded']
     four_word_stems = {
         'unrecorded': ['recorded'],
         'permanently': ['permanent'],
         'twigs': ['twig'],
         'dog': ['dog'],
     }
     self.assertDictEqual(speller.bulk_action("stem", four_words), four_word_stems)
     del speller
コード例 #20
0
ファイル: hunspell_test.py プロジェクト: ulwan/sihunspell_id
def test_clear_caches_non_peristance(hunspell):
    """After clear_cache(), a newly created 'test' instance no longer serves the planted entries."""
    real_suggest = hunspell.suggest('testing')
    real_suffix = hunspell.suffix_suggest('testing')
    real_stem = hunspell.stem('testing')

    # Plant results under a made-up word and confirm the fixture serves them back.
    hunspell._suggest_cache['made-up'] = real_suggest
    assert hunspell.suggest('made-up') == real_suggest
    hunspell._suffix_cache['made-up'] = real_suffix
    assert hunspell.suffix_suggest('made-up') == real_suffix
    hunspell._stem_cache['made-up'] = real_stem
    assert hunspell.stem('made-up') == real_stem

    hunspell.clear_cache()

    # Drop the local reference to the fixture before building the replacement.
    del hunspell
    fresh = Hunspell('test', hunspell_data_dir=DICT_DIR)
    assert fresh.suggest('made-up') != real_suggest
    assert fresh.suffix_suggest('made-up') != real_suffix
    assert fresh.stem('made-up') != real_stem
コード例 #21
0
 def test_hunspell_bulk_suggest(self):
     """bulk_action("suggest", ...) returns a dict mapping each word to its list of suggestions."""
     speller = Hunspell('en_US', hunspell_data_dir=DICT_DIR)

     # 'dpg' is queried in both calls and is expected to yield the same list each time.
     dpg_suggestions = [
         'dog', 'pg', 'deg', 'dig', 'dpt', 'dug', 'mpg', 'd pg', 'GDP',
     ]

     two_word_result = speller.bulk_action("suggest", ['dog', 'dpg'])
     self.assertDictEqual(two_word_result, {
         'dpg': dpg_suggestions,
         'dog': ['dog'],
     })

     ten_words = [
         'dog', 'dpg', 'pgg', 'opg', 'dyg', 'frg', 'twg', 'bjn', 'foo', 'qre',
     ]
     ten_word_expected = {
         'pgg': ['pg', 'peg', 'egg', 'pig', 'pug', 'pkg', 'pg g', 'PG'],
         'foo': [
             'few', 'goo', 'fop', 'foot', 'fool', 'food', 'foe', 'for',
             'fro', 'too', 'fol', 'coo', 'fog', 'moo', 'fob',
         ],
         'frg': [
             'fr', 'frig', 'frog', 'erg', 'fig', 'f*g', 'fro', 'fog',
             'fry', 'fr g',
         ],
         'twg': ['twig', 'tag', 'two', 'tog', 'tug', 'twp'],
         'bjn': ['bin', 'ban', 'bun', 'Bjorn'],
         'dog': ['dog'],
         'dpg': dpg_suggestions,
         'opg': [
             'op', 'pg', 'ope', 'ops', 'opt', 'mpg', 'opp', 'o pg',
             'op g', 'GPO',
         ],
         'dyg': ['dug', 'dye', 'deg', 'dig', 'dog', 'dying'],
         'qre': [
             'qr', 're', 'ere', 'ire', 'are', 'ore', 'Ore', 'Dre',
             'q re', 'qr e',
         ],
     }
     self.assertDictEqual(speller.bulk_action("suggest", ten_words), ten_word_expected)
     del speller
コード例 #22
0
 def leetScan(string, valDict, language="EN"):
     """Return the fraction of tokens in ``string`` that are validated leet words.

     A token counts when Hunspell rejects it, ``leetCheck`` flags it, and
     ``swapValid`` confirms it against ``valDict``.

     :param string: text to scan; tokenized with ``nltk.word_tokenize``
     :param valDict: passed through to ``swapValid``
     :param language: not used by this function; kept so existing callers work
     :return: ``Fraction(count, total tokens)``; ``Fraction(0)`` when the text
         has no tokens
     """
     h = Hunspell('en_US', hunspell_data_dir='/Library/Spelling')
     tokens = nltk.word_tokenize(string)
     # Calculate Total Words in string
     total_words = len(tokens)
     if total_words == 0:
         # Fraction(count, 0) would raise ZeroDivisionError on empty input.
         return Fraction(0)

     # Misspelled tokens that contain leet are possible candidates
     leetcandidates = [
         token for token in tokens
         if not h.spell(token) and leetCheck(token)
     ]
     # Test candidate list for word validity using swapping
     count = sum(1 for candidate in leetcandidates
                 if swapValid(candidate, valDict, h))
     return Fraction(count, total_words)
コード例 #23
0
ファイル: lmgec.py プロジェクト: sai-prasanna/lmgec-lite
 def __init__(self, threshold=0.96):
     """Load the language model, spaCy pipeline, spellchecker and word lists.

     :param threshold: stored unchanged as ``self.threshold``
     """
     # Directory of this source file; bundled resources are located relative to it.
     here = os.path.dirname(os.path.realpath(__file__))
     spelling_dir = here + '/resources/spelling/'
     inflection_file = here + "/resources/agid-2016.01.19/infl.txt"

     self.lm = LanguageModel()
     # Load spaCy
     self.nlp = spacy.load("en")
     # Hunspell spellchecker: https://pypi.python.org/pypi/CyHunspell
     # CyHunspell seems to be more accurate than Aspell in PyEnchant, but a bit slower.
     self.gb = Hunspell("en_GB-large", hunspell_data_dir=spelling_dir)
     # Inflection forms: http://wordlist.aspell.net/other/
     self.gb_infl = loadWordFormDict(inflection_file)
     # Common determiners (the empty string is included)
     self.determiners = {"", "the", "a", "an"}
     # Common prepositions (the empty string is included)
     self.prepositions = {
         "", "about", "at", "by", "for", "from", "in", "of", "on", "to", "with"
     }
     self.threshold = threshold
コード例 #24
0
from hunspell import Hunspell
from unidecode import unidecode
from nltk.tokenize import sent_tokenize, word_tokenize
from nltk.metrics.distance import edit_distance
import string
import zipfile
import re
from langdetect import detect
from functools import reduce
    
# Module-level Hunspell instance for the dictionary named 'Spanish'.
hunsp = Hunspell('Spanish')
# Translation table that deletes every character listed in string.punctuation.
table = str.maketrans('', '', string.punctuation)

def is_not_spanish_line(line):
    """Return True unless langdetect identifies ``line`` as Spanish ('es').

    A line for which detection raises an error is treated as not Spanish.
    """
    try:
        return detect(line) != 'es'
    except Exception:
        # Detection failed. Catching Exception instead of a bare ``except``
        # lets KeyboardInterrupt and SystemExit propagate.
        return True

def correct_hyphen(lines):
    for line_index, line in enumerate(lines):
        if line != []:
            last_word = line[-1].strip()
            last_char = last_word[-1]
            if last_char == '-' and  line_index+1 < len(lines):
                next_line = lines[line_index+1]
                if len(next_line) > 0:
                    first_word_next_line = next_line[0]
                    first_word_clean_next_line = first_word_next_line.translate(table)
                    #if hunsp.spell(last_word[:-1]+first_word_clean_next_line):
                    if True:
コード例 #25
0
    def __init__(self):
        """Load the models, open the camera, build the Tk window and start the video loop.

        Four model definitions are read from JSON files in the ``Models``
        directory and their weights are loaded from the matching ``.h5``
        files. The path literals use escaped backslashes, which gives the
        same runtime strings as before without relying on invalid escape
        sequences. Each JSON file is read inside a ``with`` block so it is
        closed even if reading fails; the now-closed file objects remain
        available under their original attribute names.
        """
        self.hs = Hunspell('en_US')
        self.vs = cv2.VideoCapture(0)
        self.current_image = None
        self.current_image2 = None

        with open("Models\\model_new.json", "r") as self.json_file:
            self.model_json = self.json_file.read()

        self.loaded_model = model_from_json(self.model_json)
        self.loaded_model.load_weights("Models\\model_new.h5")

        with open("Models\\model-bw_dru.json", "r") as self.json_file_dru:
            self.model_json_dru = self.json_file_dru.read()

        self.loaded_model_dru = model_from_json(self.model_json_dru)
        self.loaded_model_dru.load_weights("Models\\model-bw_dru.h5")

        with open("Models\\model-bw_tkdi.json", "r") as self.json_file_tkdi:
            self.model_json_tkdi = self.json_file_tkdi.read()

        self.loaded_model_tkdi = model_from_json(self.model_json_tkdi)
        self.loaded_model_tkdi.load_weights("Models\\model-bw_tkdi.h5")

        with open("Models\\model-bw_smn.json", "r") as self.json_file_smn:
            self.model_json_smn = self.json_file_smn.read()

        self.loaded_model_smn = model_from_json(self.model_json_smn)
        self.loaded_model_smn.load_weights("Models\\model-bw_smn.h5")

        # Counters keyed by 'blank' and by each uppercase ASCII letter, all starting at 0.
        self.ct = {}
        self.ct['blank'] = 0
        self.blank_flag = 0

        for i in ascii_uppercase:
            self.ct[i] = 0

        print("Loaded model from disk")

        self.root = tk.Tk()
        self.root.title("Sign Language To Text Conversion")
        self.root.protocol('WM_DELETE_WINDOW', self.destructor)
        self.root.geometry("900x900")

        self.panel = tk.Label(self.root)
        self.panel.place(x=100, y=10, width=580, height=580)

        self.panel2 = tk.Label(self.root)  # initialize image panel
        self.panel2.place(x=400, y=65, width=275, height=275)

        self.T = tk.Label(self.root)
        self.T.place(x=60, y=5)
        self.T.config(text="Sign Language To Text Conversion",
                      font=("Courier", 30, "bold"))

        self.panel3 = tk.Label(self.root)  # Current Symbol
        self.panel3.place(x=500, y=540)

        self.T1 = tk.Label(self.root)
        self.T1.place(x=10, y=540)
        self.T1.config(text="Character :", font=("Courier", 30, "bold"))

        self.panel4 = tk.Label(self.root)  # Word
        self.panel4.place(x=220, y=595)

        self.T2 = tk.Label(self.root)
        self.T2.place(x=10, y=595)
        self.T2.config(text="Word :", font=("Courier", 30, "bold"))

        self.panel5 = tk.Label(self.root)  # Sentence
        self.panel5.place(x=350, y=645)

        self.T3 = tk.Label(self.root)
        self.T3.place(x=10, y=645)
        self.T3.config(text="Sentence :", font=("Courier", 30, "bold"))

        self.T4 = tk.Label(self.root)
        self.T4.place(x=250, y=690)
        self.T4.config(text="Suggestions :",
                       fg="red",
                       font=("Courier", 30, "bold"))

        # Three buttons wired to action1, action2 and action3.
        self.bt1 = tk.Button(self.root,
                             command=self.action1,
                             height=0,
                             width=0)
        self.bt1.place(x=26, y=745)

        self.bt2 = tk.Button(self.root,
                             command=self.action2,
                             height=0,
                             width=0)
        self.bt2.place(x=325, y=745)

        self.bt3 = tk.Button(self.root,
                             command=self.action3,
                             height=0,
                             width=0)
        self.bt3.place(x=625, y=745)

        self.str = ""
        self.word = " "
        self.current_symbol = "Empty"
        self.photo = "Empty"
        self.video_loop()
コード例 #26
0
import spacy
# Load the spaCy 'en_core_web_md' model once, at import time.
# NOTE(review): `parsed=False` is passed straight to spacy.load — confirm the
# installed spaCy version still accepts this keyword.
nlp = spacy.load('en_core_web_md', parsed=False)

from hunspell import Hunspell
# Module-level Hunspell instance created with the library's default arguments.
h = Hunspell()

import numpy as np
from scipy import spatial
from sklearn.cluster import KMeans
from sklearn.cluster import AffinityPropagation
from PIL import ImageFont, ImageDraw

import networkx as nx
from fa2 import ForceAtlas2
import matplotlib.pyplot as plt


from word import Word

class PreProcessing():
    """Holds the corpus, word lists and font settings used for pre-processing."""

    #words are user input per line (e.g. keyword1)
    def __init__(self, stop_words_file = 'stop_words.txt', negation_stop_words_file = 'negation_stop_words.txt', max_font_size=120, min_font_size=30, min_characters=3, min_words_cluster=300, font="Verdana"):
        """Initialise empty containers and load both stop-word lists.

        :param stop_words_file: path of a text file with one stop word per line
        :param negation_stop_words_file: path of a text file with one negation
            stop word per line
        :param max_font_size: stored as ``self.max_font_size``
        :param min_font_size: stored as ``self.min_font_size``
        :param min_characters: stored as ``self.min_characters``
        :param min_words_cluster: accepted but not stored by this constructor
        :param font: stored as ``self.font``
        """
        self.corpus = [] #raw data that need to be treated for individual submissions
        self.font = font
        self.words = [] # words of each iteration
        self.stop_words = self._read_word_list(stop_words_file)
        self.neg_stop_words = self._read_word_list(negation_stop_words_file)
        self.max_font_size = max_font_size
        self.min_font_size = min_font_size
        self.min_characters = min_characters
        self.entites_freq = {} # somewhat temp?

    @staticmethod
    def _read_word_list(path):
        """Return the lines of ``path`` with newlines stripped and lower-cased; the file is closed afterwards."""
        with open(path) as handle:
            return [line.strip('\n').lower() for line in handle]
コード例 #27
0
from hunspell import Hunspell

# Hunspell instance for the "ko" dictionary, looked up in the relative directory 'ko'.
h = Hunspell("ko", hunspell_data_dir='ko')

if __name__ == "__main__":
    # Spell-check two sample words and print each result.
    answer = h.spell("안녕하세요으")

    print(answer)
    answer2 = h.spell("안녕하세")
    print(answer2)

    # Print the suggestions Hunspell offers for the first sample word.
    answer3 = h.suggest("안녕하세요으")
    print(answer3)
コード例 #28
0
ファイル: actions.py プロジェクト: leonardoaii71/starter-pack
# coding=utf-8
import locale
from datetime import datetime
import pytz
from rasa_core_sdk import Action, Tracker
from rasa_core_sdk.events import SlotSet, UserUtteranceReverted
from pymongo import MongoClient
from rasa_core_sdk.forms import FormAction, EntityFormField, FormField
from Database import Database
from hunspell import Hunspell

# Module-level Hunspell instance for the 'es_ANY' dictionary.
hsp = Hunspell('es_ANY')
# collection = Database('kb', 'Calendarios').collection

# Event actions


class ActionLogIn(FormAction):
    RANDOMIZE = False

    @staticmethod
    def required_fields():
        """Return the form's two entity fields, "matricula" then "password"."""
        matricula_field = EntityFormField("matricula", "matricula")
        password_field = EntityFormField("password", "password")
        return [matricula_field, password_field]

    def name(self):
        """Return the name of this action: 'action_login_form'."""
        return 'action_login_form'

    def submit(self, dispatcher, tracker, domain):
コード例 #29
0
 def __init__(self):
     """Create the Hunspell checker with the library's default arguments."""
     self.checker = Hunspell()
コード例 #30
0
ファイル: common.py プロジェクト: WangShengNEU/textpreprocess
 def refresh_dict(self):
     """
     Create a new Hunspell object for the dictionary named 'index' located
     in self.dictionary_directory and store it as self.hunspell.
     """
     self.hunspell = Hunspell('index',
                              hunspell_data_dir=self.dictionary_directory)