def __init__(self, modelFile):
    super(JamspellCorrector, self).__init__()
    import jamspell
    self.model = jamspell.TSpellCorrector()
    # self.model.SetPenalty(16.0, 0.0)
    if not self.model.LoadLangModel(modelFile):
        raise Exception('wrong model file: %s' % modelFile)
def spell_tokenizer(text):
    """
    Perform word tokenization using casual_tokenize after spelling correction.
    :param text: string without punctuation
    :return: list of tokens
    """
    tokens = []
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('model_en.bin')
    for word in casual_tokenize(rm_punctuation(text), preserve_case=False,
                                reduce_len=True, strip_handles=True):
        # skip spelling correction for tokens that contain digits
        if not re.search(r'\d', word):
            corr_word = corrector.GetCandidates([word], 0)
            if len(corr_word) > 0 and word != corr_word[0]:
                tokens.append(corr_word[0])  # take the best candidate
            else:
                tokens.append(word)
    wordnet_lemmatizer = WordNetLemmatizer()
    stems = [wordnet_lemmatizer.lemmatize(item) for item in tokens]
    # stemmer = PorterStemmer()
    # stems = [stemmer.stem(item) for item in tokens]
    return stems
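# A minimal usage sketch for spell_tokenizer above, not part of the original module:
# it assumes model_en.bin is in the working directory and that the names the function
# relies on (jamspell, re, nltk's casual_tokenize and WordNetLemmatizer, and the
# rm_punctuation helper) are importable. The sample text is invented.
print(spell_tokenizer("thsi is a samle tweet"))
# tokens containing digits are dropped; the rest are spell-corrected, then lemmatized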
def main():
    """Main function."""
    # optional command line args
    parser = argparse.ArgumentParser()
    parser.add_argument("--train", help="train the NN", action="store_true")
    parser.add_argument("--validate", help="validate the NN", action="store_true")
    parser.add_argument("--beamsearch", help="use beam search instead of best path decoding", action="store_true")
    args = parser.parse_args()

    # train or validate on IAM dataset
    if args.train or args.validate:
        # load training data, create TF model
        loader = DataLoader(FilePaths.fnTrain, Model.batchSize, Model.imgSize, Model.maxTextLen)

        # save characters of model for inference mode
        with open(FilePaths.fnCharList, 'w') as f:
            f.write(''.join(loader.charList))

        # execute training or validation
        if args.train:
            model = Model(loader.charList, args.beamsearch)
            train(model, loader)
        elif args.validate:
            model = Model(loader.charList, args.beamsearch, mustRestore=True)
            validate(model, loader)

    # infer text on test images
    else:
        sentence_list = []
        # print(open(FilePaths.fnAccuracy).read())
        model = Model(open(FilePaths.fnCharList).read(), args.beamsearch, mustRestore=True)
        imgFiles = os.listdir('../../WordSegmentation/out/1.png')
        for (i, f) in enumerate(imgFiles):
            print('recognised the word %s' % f)
            # read image, prepare it by resizing it to a fixed height and converting it to grayscale
            img1 = '../../WordSegmentation/out/1.png/' + f
            # img = prepareImg(cv2.imread('11.png/%s' % f), 50)
            infer(model, img1, sentence_list, f)
            # infer(model, FilePaths.fnInfer)

        # order the recognised words by position and join them into a sentence
        sentence_list = sorted(sentence_list, key=lambda entry: entry[0][0])
        sentence = ""
        for x, y in sentence_list:
            sentence = sentence + " " + y

        # fix the recognised sentence with jamspell and save it
        corrector = jamspell.TSpellCorrector()
        corrector.LoadLangModel('en.bin')
        sentence = corrector.FixFragment(sentence)
        print(sentence)
        with open("sentence.txt", "w") as text_file:
            text_file.write(sentence)
def spell_correct_context(query_str):
    corrector = jamspell.TSpellCorrector()  # create a corrector
    corrector.LoadLangModel('./en.bin')
    list_of_words = get_list(query_str)
    # print the candidates for each word
    # for i in range(len(list_of_words)):
    #     print(list_of_words[i] + " -> ", corrector.GetCandidates(list_of_words, i))
    # print("Did you mean " + "'" + corrector.FixFragment(query_str) + "'" + "?")
    return corrector.FixFragment(query_str)
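# Hedged usage sketch for spell_correct_context, assuming ./en.bin exists and the
# get_list helper from the original module is importable; the query is invented.
corrected = spell_correct_context("whta is the wether today")
print("Did you mean '%s'?" % corrected)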
def __init__(self, f_name_jamspell_model=None):
    self.max_sequence_length = None
    self.corrector = jamspell.TSpellCorrector()
    if f_name_jamspell_model is not None:
        if not os.path.isfile(f_name_jamspell_model):
            print('[E] JamSpell language model %s not found.' % f_name_jamspell_model)
            return
        self.corrector.LoadLangModel(f_name_jamspell_model)
def __loads(self):
    # load the base model
    if not os.path.exists("model"):
        raise ValueError
    self.corrector = jamspell.TSpellCorrector()
    # trained model for Russian
    if not self.corrector.LoadLangModel('ru_small.bin'):
        raise ValueError
    self.model = Model("model")
    self.morph = pymorphy2.MorphAnalyzer()
def __init__(self, tokenize=True, pretrained=False, device="cpu"):
    self.tokenize = tokenize
    self.pretrained = None
    self.device = None
    self.ckpt_path = None
    self.vocab_path, self.weights_path = "", ""
    self.model, self.vocab = None, None
    self.model = jamspell.TSpellCorrector()
    self.model.LoadLangModel('en.bin')
def __init__(self):
    self.stop = set(stopwords.words('english'))
    self.stop.update([
        '.', ',', '"', "'", '?', '!', ':', ';', '(', ')', '[', ']', '{', '}'
    ])
    self.api_url = settings.W2V_API_URL
    self.candidate_sentence = None
    self.right_sentence = None
    self.similarity = None
    self.corrector = jamspell.TSpellCorrector()
    self.corrector.LoadLangModel('en.bin')
def __init__(self):
    self.simple_charectrs = set(
        u"abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ0123456789"
        u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    )
    self.rus_charectrs = set(
        u"АаБбВвГгДдЕеЁёЖжЗзИиЙйКкЛлМмНнОоПпРрСсТтУуФфХхЦцЧчШшЩщЪъЫыЬьЭэЮюЯя"
    )
    self.corrector = jamspell.TSpellCorrector()
    self.corrector.LoadLangModel("ru_small.bin")
    self.morph = pymorphy2.MorphAnalyzer()
def normalize_queries():
    get_stopwords()
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('./jamspell_model/ru_small.bin')
    with open("queries_new.tsv", "r") as q_file_from, open("queries_norm.tsv", "w") as q_file_to:
        reader = csv.reader(q_file_from, delimiter='\t')
        writer = csv.writer(q_file_to, delimiter='\t')
        for row in tqdm(reader):
            i = int(row[0])
            query = row[1]
            query = get_google_spelling(query)
            query = normalize(query)
            # query = corrector.FixFragment(row[1])
            writer.writerow([row[0], query])
def __init__(self, jamspell_model_path: str = DEFAULT_JAMSPELL_MODEL_PATH,
             correction_mapping_path: str = DEFAULT_MAPPING_PATH,
             stanza_model_dir_path=DEFAULT_STANZA_DIR_PATH,
             use_gpu=False):
    """
    :param jamspell_model_path: Relative or absolute path to the Jamspell binary model file.
    :param correction_mapping_path: Relative or absolute path to the CSV file which conducts word replacements.
    :param stanza_model_dir_path: Relative or absolute path to the directory in which the Estonian Stanza models reside.
    :param use_gpu: Whether to use the GPU's CUDA support for Stanza operations.
    """
    self.stanza_pipeline = None
    self.corrector = jamspell.TSpellCorrector()
    self.ensure_model_folders()
    self.word_mapping = self.load_mapper_resources(correction_mapping_path)
    self._load_corrector_resources(jamspell_model_path)
    self._load_lemmatizer_resources(stanza_model_dir_path, use_gpu)
def jamspell(model: str = 'wiki', **kwargs):
    """
    Load a jamspell Spell Corrector for Malay.

    Parameters
    ----------
    model: str, optional (default='wiki')
        Supported models. Allowed values:

        * ``'wiki+news'`` - Wikipedia + News, 337MB.
        * ``'wiki'`` - Wikipedia, 148MB.
        * ``'news'`` - local news, 215MB.

    Returns
    -------
    result: malaya.spell.JamSpell class
    """
    try:
        import jamspell as jamspellpy
    except BaseException:
        raise ModuleNotFoundError(
            'jamspell not installed. Please install it and try again.')

    model = model.lower()
    supported_models = ['wiki+news', 'wiki', 'news']
    if model not in supported_models:
        raise ValueError(
            f'model not supported, available models are {str(supported_models)}')

    path = check_file(PATH_NGRAM['jamspell'][model],
                      S3_PATH_NGRAM['jamspell'][model], **kwargs)
    try:
        corrector = jamspellpy.TSpellCorrector()
        corrector.LoadLangModel(path['model'])
    except BaseException:
        raise Exception(
            f"failed to load jamspell model, please run `malaya.utils.delete_cache('preprocessing/jamspell/{model.replace('+', '-')}')`")
    return JamSpell(corrector=corrector)
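# Hedged usage sketch, assuming the loader above is exposed as malaya.spell.jamspell
# as its docstring states; downloading the 'wiki' model requires network access.
import malaya

corrector = malaya.spell.jamspell(model='wiki')
# corrector is a malaya.spell.JamSpell instance wrapping jamspell's TSpellCorrector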
def __init__(self, stopwords, lang='ru', type_preproc=None, speech_2_vec_lemm=False,
             way_to_udp=None, keep_pos_udp=True, keep_punct_udp=False,
             model_checker='ru_small.bin', parallel_workers=20):
    """
    Cleans text of special characters and stop words.

    stopwords: list
        List of stop words
    lang: {'ru', 'eng'}, default='ru'
        Language of the text
    type_preproc: {'stem', 'lemm', 'udp', None}, default=None
        Preprocessing type: stemming, lemmatization, processing with the UDPipe
        parser, or nothing. Stemming is done with nltk (SnowballStemmer).
        Lemmatization is done with pymorphy2; the part of speech from the
        pymorphy2 tags can also be appended to the lemma with an underscore,
        e.g. 'делать_VERB'. UDPipe processing is done with ufal.udpipe and
        includes lemmatization with optional part-of-speech suffixes appended
        with an underscore (tags follow the Universal PoS Tags format),
        e.g. 'делать_VERB'; punctuation can optionally be preserved.
    speech_2_vec_lemm: bool, default=False
        Flag for appending the part of speech when lemmatizing (type_preproc='lemm')
    way_to_udp: str, default=None
        Path to a trained UDPipe model
    keep_pos_udp: bool, default=True
        Flag for appending the part of speech to the lemma
    keep_punct_udp: bool, default=False
        Flag for preserving punctuation
    model_checker: str, default='ru_small.bin'
        Path to the jamspell spell-checking model
    parallel_workers: int, default=20
        Number of processes for parallelization
    """
    self.lang = lang
    self.replace_by_space_re = re.compile(r'[/(){}\[\]\|@\.,:;!?-]|[\s]')  # pattern for replacing these characters with a space
    self.bad_symbols_re = re.compile(r'[^0-9а-я #+_]')  # pattern for removing all characters except these
    self.stopwords_re = r'\b' + r'\b|\b'.join(stopwords) + r'\b'  # pattern for removing stop words
    self.type_preproc = type_preproc
    self.speech_2_vec_lemm = speech_2_vec_lemm
    self.way_to_udp = way_to_udp
    self.keep_pos_udp = keep_pos_udp
    self.keep_punct_udp = keep_punct_udp
    self.corrector = jamspell.TSpellCorrector()
    self.corrector.LoadLangModel(model_checker)
    self.parallel_workers = parallel_workers
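# Hedged usage sketch for the preprocessor above. The class name TextCleaner is
# hypothetical (only __init__ is shown); the stop-word list is invented, and
# ru_small.bin must exist on disk for jamspell to load.
cleaner = TextCleaner(stopwords=['и', 'в', 'на'], lang='ru', type_preproc='lemm')
print(cleaner.corrector.FixFragment('превет мир'))  # the loaded jamspell model fixes the fragment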
def corrector_data(audio_wav_path: str):
    # recognize the speech in the audio file as text
    res = get_data_in_audio(audio_wav_path)

    import jamspell
    from nltk.tokenize import RegexpTokenizer
    from nltk.corpus import stopwords
    from nltk.tokenize import sent_tokenize, word_tokenize

    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('ru_small.bin')

    # data = [[[i.normal_form for i in morph.parse(j) if i.score == 1 or i.score == 0.5]
    #          for j in corrector.GetCandidates([i, ], 0)] for i in raw]
    # tokenizer = RegexpTokenizer(r'\w+')

    stop_words = stopwords.words("russian")
    mega_set = set()
    for l in res:
        for token in word_tokenize(l, language="russian"):
            if token in stop_words:
                continue
            token_norm_set = norm_dict(corrector, token)
            mega_set |= token_norm_set
    return mega_set
if v0 != v1:
    cprint(f"[{ext_name}] invalid value for enum attribute \"{p}\": {v0} != {v1}", 'magenta')
    d0 = v0 - v1
    d1 = v1 - v0
    if len(d0):
        cprint(f"[{ext_name}] missing elements are: {d0}", 'magenta')
    if len(d1):
        cprint(f"[{ext_name}] invalid elements in doc are: {d1}", 'magenta')
elif attr == HAVE_EXTERNAL:
    if ext_name.startswith("ui.") and p not in ("@fontname",):
        cprint(f"DOC [{ext_name}] missing enum attribute \"{p}\"", 'magenta')
elif attr == HAVE_PDDOC:
    cprint(f"DOC [{ext_name}] no enum for attribute \"{p}\" (in external)", 'magenta')

if args.spell:
    import jamspell
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('ceammc.bin')
    cprint(f"checking [{ext_name}] ...", "blue")
    check_spell(root)
def spellCorrect(self):
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('en.bin')
    output = corrector.FixFragment(self.fileContent)
    return output
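# Hedged usage sketch: the enclosing class is not shown, so FileChecker below is a
# hypothetical stand-in that only provides the fileContent attribute the method reads;
# en.bin must be present on disk.
class FileChecker:
    def __init__(self, fileContent):
        self.fileContent = fileContent

    spellCorrect = spellCorrect  # attach the module-level function above as a method

print(FileChecker("Ths sentense has erors.").spellCorrect())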
def permutative_solve(detected_bank, detected_puzzle=None):
    # detected_bank = [word.lower() for word in detected_bank]
    if detected_puzzle is None:
        detected_puzzle = []
        with open('test_searches/word_search.txt', 'r') as f:
            line = f.readline()
            while line:
                detected_puzzle.append(line[:-1].upper())
                detected_puzzle[-1] = detected_puzzle[-1].split(' ')
                line = f.readline()
    else:
        size = len(detected_puzzle)
        temp_puzzle = [['|' for _ in range(size)] for _ in range(size)]
        for i in range(len(detected_puzzle)):
            for j in range(len(detected_puzzle[i])):
                temp_puzzle[i][j] = detected_puzzle[i][j]
        print(temp_puzzle)

    solver = ps.PuzzleSolver(len(detected_puzzle[0]), len(detected_puzzle), detected_puzzle, detected_bank)
    print('-------------------SOLVING PUZZLE----------------------')
    incorrect_words, found = solver.solve()
    if len(incorrect_words) == 0:
        print('\nALL WORDS FOUND!')
        return found

    print('-----------------RETRYING WITH SWAPPED CHARACTERS----------------')
    potential_words = []
    for word in incorrect_words:
        permutations = letter_swap(word)
        permutations.append(word)
        potential_words.append(permutations)
    print(potential_words)
    for word in solver.potential_words_solve(incorrect_words, potential_words):
        found.append(word)
    if len(incorrect_words) == 0:
        print('\nALL WORDS FOUND!')
        return found

    print('-----------------RETRYING WITH NEW BANK----------------')
    corrector = jamspell.TSpellCorrector()
    corrector.LoadLangModel('protos_data/en.bin')
    potential_words = []
    # get word candidates that may be the correct target word, put them in potential_words
    lower_case_attempts = [word.lower() for word in incorrect_words]
    for i in range(len(lower_case_attempts)):
        candidates = list(corrector.GetCandidates(lower_case_attempts, i))
        candidates = [word.upper() for word in candidates]
        candidates.append(incorrect_words[i])
        potential_words.append(candidates)
    if len(potential_words) == 0:
        print('\nCould not find alternate spellings for the following words: ', incorrect_words)
        return found
    else:
        print('Incorrect words', incorrect_words)
        for word in solver.potential_words_solve(incorrect_words, potential_words):
            found.append(word)

    # found every word! done
    if len(incorrect_words) == 0:
        print('\nALL WORDS FOUND!')
        return found

    print('SOME WORDS NOT FOUND:', incorrect_words)
    return found
def trainLangModel(trainText, alphabetFile, modelFile):
    corrector = jamspell.TSpellCorrector()
    corrector.TrainLangModel(trainText, alphabetFile, modelFile)
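# Hedged usage sketch for trainLangModel: the file names (train_corpus.txt,
# alphabet.txt, my_model.bin) are hypothetical placeholders. jamspell's
# TrainLangModel consumes a plain-text corpus plus an alphabet file and writes
# the binary model, which LoadLangModel can then read back.
import jamspell

trainLangModel('train_corpus.txt', 'alphabet.txt', 'my_model.bin')
corrector = jamspell.TSpellCorrector()
if not corrector.LoadLangModel('my_model.bin'):
    raise Exception('failed to load the freshly trained model')
print(corrector.FixFragment('a sentnce with typoes'))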
def spell_corr():
    corrector = jamspell.TSpellCorrector()
    result = corrector.LoadLangModel('model_en.bin')  # LoadLangModel returns a success flag
    logger.info(f"corrector created: {result}")
    return corrector
def get_spell_checker():
    spell_checker = jamspell.TSpellCorrector()
    spell_checker.LoadLangModel(get_jamspell_model_file_name())
    return spell_checker
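# Hedged usage sketch, assuming get_jamspell_model_file_name from the original
# module resolves to a valid jamspell binary model on disk.
checker = get_spell_checker()
print(checker.FixFragment("spleling is hrad"))  # FixFragment corrects a whole fragment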