Example 1
    def addCustomDict(self, customDictPath: str):
        logger.debug('Add custom dictionary: {}'.format(customDictPath))

        dic_folder = os.path.dirname(customDictPath)
        dic_file_name = os.path.basename(customDictPath)
        dic_name = dic_file_name[:-4]
        dic_file = customDictPath
        aff_file = customDictPath[:-4] + '.aff'

        key = (dic_name, customDictPath)
        if key in self._checkers:
            logger.debug('Dictionary already added: {}'.format(customDictPath))
            return

        try:
            create_new_dic_file(dic_file)
            create_new_aff_file(aff_file)
            fix_dic_file(dic_file)
            checker = hunspell.Hunspell(
                dic_name,
                hunspell_data_dir=dic_folder,
                system_encoding='UTF-8')
            self._checkers[key] = checker
            self._customDicts.append((key, dic_file))
        except IOError:
            logger.error(
                "Can't create custom dictionary: {}".format(customDictPath))
Example 2
    def __init__(self, langlist: List[str], folders: List[str]):
        """
        langlist - list of language codes ("ru_RU", "en_US", etc.)
        """
        logger.debug('Initialize HunspellWrapper spell checker')

        # Key - language (en_US, ru_RU etc),
        # value - instance of the HunSpell class
        self._checkers = {}

        # Index - number of the dictionary,
        # value - tuple: (key for self._checkers, path to .dic file)
        self._customDicts = []

        dictsFinder = DictsFinder(folders)

        for lang in langlist:
            checker = None

            for path in dictsFinder.getFoldersForLang(lang):
                dic_file = os.path.join(path, lang + '.dic')
                aff_file = os.path.join(path, lang + '.aff')

                if (checker is None and
                        os.path.exists(dic_file) and
                        os.path.exists(aff_file)):
                    checker = hunspell.Hunspell(
                        lang, hunspell_data_dir=path, system_encoding='UTF-8')

                logger.debug('Add dictionary: {}'.format(dic_file))

            if checker is not None:
                self._checkers[lang] = checker
Example 3
    def __init__(self, input_data):
        super().__init__(input_data=input_data)
        self.hunspell = hunspell.Hunspell()

        self.correct_words = []
        self.incorrect_words = []
        # Tuples of (original word, Hunspell suggestion, Levenshtein distance)
        self.full_data = []
        self.has_done_work = False

        self.tokenizer = TweetTokenizer()
Example 4
import nltk
import hunspell


def preprocess_texts_from_given_forum(forum_id, date_from, date_to, filename):
    data_frame = get_texts_and_prepare_data_frame(date_from, date_to, forum_id)
    data_frame = delete_undesired_elements_from_texts(data_frame)
    tokens = data_frame.post.apply(lambda x: nltk.word_tokenize(x))
    hun = hunspell.Hunspell('pl')
    counter = [0]
    tokens_stemmed = tokens.apply(lambda x: correct_writing(hun, x, counter))
    tokens_stemmed = delete_stop_words(tokens_stemmed)
    data_frame.post = tokens_stemmed
    data_frame.to_csv(
        filename, sep=';',
        escapechar='\\')  # additional save after preprocessing
    return data_frame
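correct_writing and delete_stop_words are helpers from the same project and are not shown in this excerpt. A plausible minimal sketch of correct_writing, assuming it swaps misspelled tokens for Hunspell's top suggestion and uses counter to tally how many corrections were made:

def correct_writing(hun, tokens, counter):
    # Hypothetical sketch: keep correctly spelled tokens, otherwise take the first suggestion.
    corrected = []
    for token in tokens:
        if hun.spell(token):
            corrected.append(token)
            continue
        suggestions = hun.suggest(token)
        counter[0] += 1  # number of tokens that needed correction
        corrected.append(suggestions[0] if suggestions else token)
    return corrected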
Example 5
    def __init__(self,
                 dict_file,
                 aff_file,
                 add_words_file,
                 tokenizer_language,
                 dictionary_dir=None,
                 language_long=None):

        self.logger = logging.getLogger(os.path.basename(sys.argv[0]))
        self.dict_file = dict_file
        if self.dict_file is None:
            print('*** Missing spelling-dict-file in configuration. Exiting.')
            sys.exit(1)

        self.aff_file = aff_file
        if self.aff_file is None:
            print('*** Missing spelling-aff-file in configuration. Exiting.')
            sys.exit(1)

        self.add_words_file = add_words_file

        if sys.platform == 'darwin':
            localdir = os.path.dirname(__file__)
            dict_dir = os.path.join(localdir, dictionary_dir)
            self.speller = hunspell.Hunspell(language_long,
                                             hunspell_data_dir=dict_dir)
        else:
            self.speller = hunspell.HunSpell(self.dict_file, self.aff_file)

        if self.speller is None:
            print('>>>>>> Could not create speller...')

        try:
            self.tokenizer = nltk.data.load(
                'tokenizers/punkt/{0}.pickle'.format(tokenizer_language))
        except LookupError:
            print('>>>>>> Could not load TOKENIZER language file.')
            sys.exit(1)

        if self.add_words_file is not None:
            self.train()
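The train method referenced above is not part of this excerpt. A minimal sketch, assuming add_words_file lists one extra word per line and that the speller object supports Hunspell's add() for extending the runtime dictionary:

    def train(self):
        # Hypothetical sketch: add each custom word to the in-memory dictionary.
        with open(self.add_words_file, 'r', encoding='utf-8') as fp:
            for line in fp:
                word = line.strip()
                if word:
                    self.speller.add(word)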
Example 6
 def check(self, fname):
     """Check file."""
     if os.path.exists(self.exclude_fname):
         patterns = [
             _make_abspath(item) for item in _read_file(self.exclude_fname)
         ]
         if any(fnmatch(fname, pattern) for pattern in patterns):
             return []
     ret = []
     if not self.native:
         # hunspell has trouble with apostrophes and other delimiters out-of-the-box
         words = []
         with open(fname, "r") as fobj:
             for line in fobj:
                 for word in re.split("[^a-zA-Z]", line.strip()):
                     words.append(word)
         with TmpFile(
                 lambda x: x.write(os.linesep.join(words))) as temp_fname:
             stdout, _ = _shcmd(self.cmd + [temp_fname])
         words = sorted(
             list(set([word.strip() for word in stdout if word.strip()])))
         if words:
             ldict = _grep(fname, words)
             for word, lines in [(word, ldict[word]) for word in words]:
                 for lnum in lines:
                     ret.append((lnum, (word, )))
     else:
         spell_obj = hunspell.Hunspell("en_US")
         ret = []
         with open(fname, "r") as fobj:
             for num, line in enumerate(fobj):
                 line = line.strip()
                 for word in re.split("[^a-zA-Z]", line):
                     if (not spell_obj.spell(word)) and (
                             word not in self.whitelist):
                         ret.append((num + 1, (word, )))
     return ret
Example 7
import pandas as pd
import hunspell

FILE_NAME = "markov.csv"

df = pd.read_csv(PATH + FILE_NAME)

all_words = []

for index, row in df.iterrows():
    all_words.append(row['hunspell_errors'])

flat_list = ("".join(item for sublist in all_words for item in sublist)
             .replace("][", ",")
             .replace('"', "'")
             .replace("','", "', '"))

print('"' in flat_list)

splitlist = flat_list[2:].split("\', \'")

hs = hunspell.Hunspell()

tuples = []

from tqdm import tqdm
for word in tqdm(splitlist, desc="Loading Suggestions"):
    suggestions = hs.suggest(word)
    if suggestions and word != "<END>":
        tuples.append([word, suggestions[0]])

print(tuples)

from Levenshtein import distance


def calc_levenshtein(tup):
    return distance(tup[0], tup[1])
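With the helper above, a Levenshtein distance can be attached to each (word, suggestion) pair gathered earlier, for example:

# Score every [word, suggestion] pair collected above with its edit distance.
scored = [(word, suggestion, calc_levenshtein((word, suggestion)))
          for word, suggestion in tuples]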
Example 8
import hunspell

dic_any = hunspell.Hunspell("es_ANY")
dic_es = hunspell.Hunspell("es_ES", "es_ES")
dic_en = hunspell.Hunspell("en_US", "en_US")

res = dic_any.spell("análisis")


def check_spell(words):
    count = 0
    count_en = 0
    if len(words) == 0:
        return 1, 0
    for word in words:
        correct_es = dic_any.spell(word) or dic_es.spell(word)
        count = count + 1 if correct_es else count
        if not correct_es:
            # Fall back to the English dictionary for words the Spanish ones reject.
            correct_en = dic_en.spell(word)
            count_en = count_en + 1 if correct_en else count_en
    return count / len(words), count_en
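A quick illustration of check_spell (the word list is just an example):

# Fraction of words accepted by the Spanish dictionaries, plus how many of the
# rejected words were still recognised by the English dictionary.
ratio_es, english_only = check_spell(["análisis", "hello", "casa"])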
Example 9
 def dictionary(self, name, path):
     self.checker = hunspell.Hunspell(name, hunspell_data_dir=path)
     return self
Example 10
import re
from re import finditer
import hunspell
import xml.etree.ElementTree as ET

from sklearn import model_selection, preprocessing, linear_model, naive_bayes, metrics, svm
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn import decomposition, ensemble

import pandas, xgboost, numpy, textblob, string
from keras.preprocessing import text, sequence
from keras import layers, models, optimizers

from nltk.tokenize.treebank import TreebankWordDetokenizer

from collections import Counter

LANGUAGE_CODE = 'es'
dictionary = hunspell.Hunspell('es_ANY', hunspell_data_dir="./")

emoji_pattern = re.compile(
    "["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+",
    flags=re.UNICODE)

data_path = "./public_data_development/"
data_path_mint = "/home/nacho/DATASETS/public_data_development/"
parser_dev = ET.XMLParser(encoding='utf-8')
Example 11
import hunspell
import os

curr_dir = os.getcwd()
print(curr_dir)
dir1 = curr_dir + '/dictionary'
print(dir1)
h = hunspell.Hunspell('en_US' , hunspell_data_dir=dir1)
print("Word Suggestions ready")

while True:
	i = input("String?")
	list1 = h.suggest(i)
	print(list1)
	list2 = []
	for s in list1:
		if s.startswith(i):
			list2.append(s)
	print(list2)
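The loop above keeps only suggestions that complete the typed prefix; the same idea as a reusable function might look like this:

def prefix_suggestions(checker, prefix):
    # Keep only suggestions that extend what the user has already typed.
    return [s for s in checker.suggest(prefix) if s.startswith(prefix)]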
Example 12
 def test_given_correct_word_when_testing_then_true(self):
     word = "konstytucja"
     res = prp.is_correct(word, hun=hunspell.Hunspell('pl'))
     self.assertTrue(res)
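prp.is_correct is the function under test and is not shown here; judging from this test, a minimal sketch would simply delegate to Hunspell:

def is_correct(word, hun):
    # Hypothetical sketch of prp.is_correct: a word is correct if Hunspell accepts it.
    return hun.spell(word)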
Example 13
 def __init__(self):
     ruta_recursos = BASE_DIR + "/res/hunspell-es/"
     print(ruta_recursos)
     #self.dic = hunspell.Hunspell(ruta_recursos + "es_ANY.dic", ruta_recursos + "es_ANY.aff")
     self.dic = hunspell.Hunspell(ruta_recursos + "es_ANY")
Example 14
import warnings

import hunspell
import spacy
from sklearn import preprocessing

from nltk.tokenize.treebank import TreebankWordDetokenizer
from collections import Counter

from textacy import keyterms

warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')
warnings.filterwarnings(action='ignore')

LANGUAGE_CODE = ['es', 'cr', 'mx', 'pe', 'uy']
CROSS_LINGUAL = [True, False]
# CROSS_LINGUAL = [True]
bTestPhase = False  # If testing, concatenate train + dev; otherwise use dev as the test set

dictionary = hunspell.Hunspell(
    'es_ANY',
    hunspell_data_dir="./dictionaries")  # In case you're using CyHunspell
print("Loading Hunspell directory")
# dictionary = hunspell.HunSpell('./Dict/es_ANY.dic', "./Dict/es_ANY.aff")  # In case you're using Hunspell

LABEL_ENCODER = preprocessing.LabelEncoder()
TERNARY_LABEL_ENCODER = preprocessing.LabelEncoder()
data_test_path = "./public_data_task1/"
data_path = "./public_data_development/"
# data_path = "../TASS2019/DATASETS/public_data/"

print("Loading Spacy Model")
lemmatizer = spacy.load(
    "es_core_news_sm")  # GLOBAL to avoid loading the model several times

print("Loading NLTK stuff")
Example 15
    def __init__(self):
        self.directory = 'C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/model/'
        dict_path = 'C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/hunspell-master/dicts/en_US/'
        self.hs = hunspell.Hunspell("en_US", hunspell_data_dir=dict_path)
        #self.hs = hunspell.Hunspell('C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/hunspell-master/dicts/en_US')
        self.vs = cv2.VideoCapture(0 + cv2.CAP_DSHOW)
        self.current_image = None
        self.current_image2 = None
        
        self.json_file = open(self.directory+"model-bw.json", "r")
        self.model_json = self.json_file.read()
        self.json_file.close()
        self.loaded_model = model_from_json(self.model_json)
        self.loaded_model.load_weights(self.directory+"model-bw.h5")

        self.json_file_dru = open(self.directory+"model-bw_dru.json" , "r")
        self.model_json_dru = self.json_file_dru.read()
        self.json_file_dru.close()
        self.loaded_model_dru = model_from_json(self.model_json_dru)
        self.loaded_model_dru.load_weights(self.directory+"model-bw_dru.h5")

        self.json_file_tkdi = open(self.directory+"model-bw_tkdi.json" , "r")
        self.model_json_tkdi = self.json_file_tkdi.read()
        self.json_file_tkdi.close()
        self.loaded_model_tkdi = model_from_json(self.model_json_tkdi)
        self.loaded_model_tkdi.load_weights(self.directory+"model-bw_tkdi.h5")

        self.json_file_smn = open(self.directory+"model-bw_smn.json" , "r")
        self.model_json_smn = self.json_file_smn.read()
        self.json_file_smn.close()
        self.loaded_model_smn = model_from_json(self.model_json_smn)
        self.loaded_model_smn.load_weights(self.directory+"model-bw_smn.h5")
        
        self.ct = {}
        self.ct['blank'] = 0
        self.blank_flag = 0
        for i in ascii_uppercase:
          self.ct[i] = 0
        print("Loaded model from disk")
        self.root = tk.Tk()
        self.root.title("SIGN LANGUAGE ILLUSTRATOR")
        self.root.protocol('WM_DELETE_WINDOW', self.destructor)
        self.root.attributes('-fullscreen', True)
        self.root.bind('<Escape>', lambda e: self.root.destroy())
        #self.root.geometry("1080x1920")
        
        self.panel = tk.Label(self.root)
        self.panel.place(x = 135, y = 10, width = 640, height = 640)
        
        self.panel2 = tk.Label(self.root) # initialize image panel
        self.panel2.place(x = 460, y = 95, width = 310, height = 310)
        
        self.T = tk.Label(self.root)
        self.T.place(x=21, y = 17)
        self.T.config(text = "SIGN LANGUAGE TO TEXT",font=("sans-serif",25,"bold"))
        
        self.panel3 = tk.Label(self.root) # Current symbol
        self.panel3.place(x = 1100,y=90)
        
        self.T1 = tk.Label(self.root)
        self.T1.place(x = 900,y = 90)
        self.T1.config(text="Character :",font=("sans-serif",15,"bold"))
        
        self.panel4 = tk.Label(self.root) # Word
        self.panel4.place(x = 1100 ,y=130)
        
        self.T2 = tk.Label(self.root)
        self.T2.place(x = 900,y = 130)
        self.T2.config(text ="Word :",font=("sans-serif",15,"bold"))
        
        self.panel5 = tk.Label(self.root) # Sentence
        self.panel5.place(x = 1100,y=170)
        
        self.T3 = tk.Label(self.root)
        self.T3.place(x = 900,y = 170)
        self.T3.config(text ="Sentence :",font=("sans-serif",15,"bold"))

        self.T4 = tk.Label(self.root)
        self.T4.place(x = 900,y = 220)
        self.T4.config(text = "Suggestions :-",fg="blue",font = ("sans-serif",20,"bold"))

        self.bt1=tk.Button(self.root, command=self.action1,height = 0,width = 0)
        self.bt1.place(x = 900,y=260)
        #self.bt1.grid(padx = 900, pady = 180)
        
        self.bt2=tk.Button(self.root, command=self.action2,height = 0,width = 0)
        self.bt2.place(x = 1200,y=260)            
       # self.bt2.grid(row = 4, column = 1, columnspan = 1, padx = 900, pady = 210, sticky = tk.NW)
        
        self.panel3.place(x = 1100,y=90)

        self.bt3=tk.Button(self.root, command=self.action3,height = 0,width = 0)
        self.bt3.place(x = 900,y=300)
        #self.bt3.grid(row = 4, column = 2, columnspan = 1, padx = 900, pady = 270, sticky = tk.NW)
        
        self.bt4=tk.Button(self.root, command=self.action4,height = 0,width = 0)
        self.bt4.place(x = 1200,y=300)
       # self.bt4.grid(row = 5, column = 0, columnspan = 1, padx = 900, pady = 300, sticky = tk.N)
        
        self.bt5=tk.Button(self.root, command=self.action5,height = 0,width = 0)
        self.bt5.place(x = 1050,y=340)
       # self.bt5.grid(row = 5, column = 1, columnspan = 1, padx = 900, pady = 330, sticky = tk.N)
        
        
        self.image1 = Image.open("C:/Users/vyaso/OneDrive/Documents/Sign-Language-to-Text-master/Sign-Language-to-Text-master/sign.jpg")
        self.image1 = self.image1.resize((400, 300), Image.ANTIALIAS)
        test = ImageTk.PhotoImage(self.image1)
        self.label1 = tk.Label(image=test)
        self.label1.image = test     
        # Position image
        self.label1.place(x=900, y=420)
        
        self.str=""
        self.word=""
        self.current_symbol="Empty"
        self.photo="Empty"
        self.video_loop()
Example 16
import pickle
import gensim
import pandas as pd
import spacy
import hunspell

from util.SpellChecker import check_spell
from util.TextPreprocessor import clean_text
from util.UserStoryParser import parse_user_story

dic_es = hunspell.Hunspell("es_ES", "es_ES")
dic_en = hunspell.Hunspell("en_US", "en_US")

try:
    with open('./../word2vec.pickle', 'rb') as handle:
        word2vec = pickle.load(handle)
except FileNotFoundError:
    word2vec = gensim.models.KeyedVectors.load_word2vec_format(
        './../resources/SBW-vectors-300-min5.bin', binary=True)
    with open('./../word2vec.pickle', 'wb') as handle:
        pickle.dump(word2vec, handle, protocol=pickle.HIGHEST_PROTOCOL)


def label_user_stories():
    user_stories_df = pd.read_csv("./../resources/data_aug.csv", sep="\t")

    user_stories_df["cleaned"] = user_stories_df.apply(
        lambda row: clean_text(row["description"]), axis=1)
    user_stories = [parse_user_story(us) for us in user_stories_df["cleaned"]]

    spacy_nlp = spacy.load('es')
Example 17
 def test_given_incorrect_word_when_testing_then_false(self):
     word = "karkoweczka"
     res = prp.is_correct(word, hun=hunspell.Hunspell('pl'))
     self.assertFalse(res)
Example 18
    def __init__(self, obj):

        self.root = Toplevel(obj.root)
        self.root.title("Gesture to Text and Voice")
        self.root.protocol('WM_DELETE_WINDOW', self.destructor)
        self.root.geometry('1600x1000')
        self.root.configure(bg="gray11")
        self.root.withdraw()
        splash = Splash(self)

        #RGB feed
        self.panel1 = Label(self.root)
        self.panel1.place(x=500, y=-60, width=600, height=600)

        #binary feed
        self.panel2 = Label(self.root)
        self.panel2.place(x=800, y=95, width=250, height=250)

        #Gesture Chart
        self.gesturechart = cv2.imread("images\\canny.png")
        self.panel3 = Label(self.root)
        self.panel3.place(x=50, y=0, width=400, height=500)
        self.im1 = cv2.cvtColor(self.gesturechart, cv2.COLOR_BGR2RGBA)
        self.im1 = cv2.resize(self.im1, (400, 500))
        self.im1 = Image.fromarray(self.im1)
        self.g1 = ImageTk.PhotoImage(image=self.im1)
        self.panel3.img = self.g1
        self.panel3.configure(image=self.g1)

        #sentence
        self.label1 = Label(self.root,
                            text="Sentence:",
                            bg="gray1",
                            fg="red",
                            font=("Courier", 20))
        self.label1.place(x=50, y=700)

        self.label2 = Label(self.root, bg="gray1", fg="yellow")
        self.label2.place(x=250, y=700)

        #Predicted letter
        self.label3 = Label(self.root,
                            text="Predicted letter:",
                            bg="gray1",
                            fg="red",
                            font=("Courier", 20))
        self.label3.place(x=50, y=580)

        self.label4 = Label(self.root,
                            bg="gray1",
                            fg="cyan",
                            font=("Courier", 20))
        self.label4.place(x=350, y=580)

        #word formation
        self.label5 = Label(self.root,
                            text="Current Word:",
                            bg="gray1",
                            fg="red",
                            font=("Courier", 20))
        self.label5.place(x=50, y=640)

        self.label6 = Label(self.root,
                            bg="gray1",
                            fg="cyan",
                            font=("Courier", 20))
        self.label6.place(x=350, y=640)
        #suggestion header
        self.label7 = Label(self.root,
                            bg="gray1",
                            text="Suggestions",
                            fg="spring green",
                            font=("Courier", 30))
        self.label7.place(x=1200, y=10)

        #Buttons

        # Enter button
        self.bt0 = Button(self.root,
                          text="Enter",
                          font=("Courier", 15, "bold"),
                          command=self.suggest,
                          bg="gray9",
                          fg="deepskyblue2")
        self.bt0.place(x=600, y=500)
        # Sign-to-speech button
        self.bt_speak = Button(self.root,
                               text="Speak",
                               font=("Courier", 15, "bold"),
                               command=self.gesture_to_voice,
                               bg="gray9",
                               fg="deepskyblue2")
        self.bt_speak.place(x=800, y=500)

        #reset buttons
        self.bt_reset = Button(self.root,
                               text="Reset",
                               font=("Courier", 15, "bold"),
                               command=self.reset,
                               bg="gray9",
                               fg="deepskyblue2")
        self.bt_reset.place(x=1000, y=500)

        #Suggestion buttons
        self.bt1 = Button(self.root,
                          command=self.action1,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt1.place(x=1125, y=60)

        self.bt2 = Button(self.root,
                          command=self.action2,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt2.place(x=1265, y=110)

        self.bt3 = Button(self.root,
                          command=self.action3,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt3.place(x=1405, y=60)

        self.bt4 = Button(self.root,
                          command=self.action4,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt4.place(x=1125, y=160)

        self.bt5 = Button(self.root,
                          command=self.action5,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt5.place(x=1265, y=210)

        self.bt6 = Button(self.root,
                          command=self.action6,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt6.place(x=1405, y=160)

        self.bt7 = Button(self.root,
                          command=self.action7,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt7.place(x=1125, y=270)

        self.bt8 = Button(self.root,
                          command=self.action8,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt8.place(x=1265, y=330)

        self.bt9 = Button(self.root,
                          command=self.action9,
                          bg="gray9",
                          fg="pink",
                          height=0,
                          width=0)
        self.bt9.place(x=1405, y=270)

        self.bt10 = Button(self.root,
                           command=self.action10,
                           bg="gray9",
                           fg="pink",
                           height=0,
                           width=0)
        self.bt10.place(x=1125, y=390)

        self.bt11 = Button(self.root,
                           command=self.action11,
                           bg="gray9",
                           fg="pink",
                           height=0,
                           width=0)
        self.bt11.place(x=1265, y=450)

        self.bt12 = Button(self.root,
                           command=self.action12,
                           bg="gray9",
                           fg="pink",
                           height=0,
                           width=0)
        self.bt12.place(x=1405, y=390)

        self.bw_model = obj.bw_model
        self.dru_model = obj.dru_model
        self.tkdi_model = obj.tkdi_model
        self.vw_model = obj.vw_model
        self.aesmn_model = obj.aesmn_model

        self.directory = 'model\\'
        self.cam = cv2.VideoCapture(0)
        self.current_image = None
        self.canny_img = None
        self.image_x = 128
        self.image_y = 128

        self.ct = {}
        self.ct['blank'] = 0
        self.blank_flag = 0
        for i in ascii_uppercase:
            self.ct[i] = 0

        self.sentence = ""
        self.word = ""
        self.current_symbol = "Empty"

        curr_dir = os.getcwd()

        dir1 = curr_dir + '/dictionary'

        self.h = hunspell.Hunspell('en_US', hunspell_data_dir=dir1)

        self.engine = pyttsx3.init()
        self.voices = self.engine.getProperty('voices')
        self.engine.setProperty('voice', self.voices[1].id)
        self.engine.setProperty('rate', 130)

        splash.destroy()
        self.root.deiconify()
        self.videoloop()
Example 19
    os.mkdir(clean_tables_dir)

tables_csv = [
    f for f in os.listdir(tables_dir)
    if os.path.isfile(os.path.join(tables_dir, f))
]

# Import standard Slovak vocabulary corpus and dictionary
import hunspell

normal_SK = os.path.join(working_dir, 'Dicts\\sk_SK')
english_US = os.path.join(working_dir, 'Dicts\\en_US')
special_SK = os.path.join(working_dir, 'Dicts\\sk_SK_special')

# Dictionary combining standard Slovak with sector-specific contract words built by build_special_dictionary.py
hunspell_normal = hunspell.Hunspell(normal_SK, normal_SK)
hunspell_english = hunspell.Hunspell(english_US, english_US)
hunspell_special = hunspell.Hunspell(normal_SK, special_SK)


# Custom spell-check helper that case-folds and strips the word before checking it
def spell(word):
    word = word.casefold().strip()
    return (hunspell_normal.spell(word) or hunspell_english.spell(word)
            or hunspell_special.spell(word))
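A small usage sketch for the combined checker above (the token list is illustrative):

# Words rejected by all three dictionaries are candidates for typo/OCR review.
unknown = [w for w in ['zmluva', 'contract', 'xqzt'] if not spell(w)]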


# Import keywords and add them to the special dictionary for spellchecking
with open('keywords.txt', 'r', encoding='utf-8') as fo:
    lines = fo.readlines()
            new_word = new_word + char
            word = True
        else:
            if word:
                words.append(new_word)
                new_word = ''
            word = False

    return words


# Import standard Slovak dictionary
normal_SK = os.path.join(os.getcwd(), 'Dicts\\sk_SK')
english_US = os.path.join(os.getcwd(), 'Dicts\\en_US')

hunspell_normal = hunspell.Hunspell(normal_SK, normal_SK)
hunspell_english = hunspell.Hunspell(english_US, english_US)


def check_normal(word):
    return hunspell_normal.spell(word) or hunspell_english.spell(word)


# Find all text contracts
find_txt = re.compile('txt')
working_dir = os.getcwd() + '\\IT_contracts_text\\'

contracts = [
    f for f in os.listdir(working_dir)
    if os.path.isfile(os.path.join(working_dir, f))
]