def validate_spelling(tree, filename, options):
    """
    Checks spelling of text within tags.

    If options['learn'], then unknown words will be added to the dictionary.
    """
    result = True
    try:
        speller = aspell.Speller(('lang', 'en'),
                                 ('personal-dir', '.'),
                                 ('personal', VOCABULARY))
    except aspell.AspellConfigError:
        # some versions of aspell use a different path
        speller = aspell.Speller(('lang', 'en'),
                                 ('personal-path', './' + VOCABULARY))
    if options['debug']:
        for key in speller.ConfigKeys():
            print(key[0] + ' ' + str(key[2]) + '\n')
    try:
        root = tree.getroot()
        for section in root.iter():
            if section.text and isinstance(section.tag, str) and \
                    section.tag not in ('a', 'code', 'monospace', 'pre'):
                for word in re.findall(r"([a-zA-Z]+'?[a-zA-Z]+)", section.text):
                    if not speller.check(word):
                        if options['learn']:
                            speller.addtoPersonal(word)
                        else:
                            result = False
                            print('[-] Misspelled (unknown) word {0} in {1}'.format(
                                word, filename))
        if options['learn']:
            speller.saveAllwords()
    except aspell.AspellSpellerError as exception:
        print('[-] Spelling disabled ({0})'.format(exception))
    return result
def evaluate_aspell_builtin(input, lang_code):
    """Run the plain aspell checker over tokenized articles and build a JSON
    string of per-token predictions."""
    import aspell
    input = build_article_information(input)
    result_content = "{ \"predictions\": [\n"
    for aidx, article in enumerate(input):
        for sidx, sentence in enumerate(article.sentences):
            chkr = aspell.Speller('lang', lang_code.split("_")[0])
            tokens, spaces = call_regex(sentence)
            shift = 0
            for tidx, t in enumerate(tokens):
                # escape characters that would break the JSON output
                if t == "\"":
                    t = t.replace("\"", "\\\"")
                if t == "\\":
                    t = t.replace("\\", "\\\\")
                token = t
                suggestions = []
                try:
                    if not chkr.check(t):
                        sugg = chkr.suggest(t)
                        if len(sugg) > 0:
                            tempSuggestion = sugg[0].strip()
                            if " " in tempSuggestion:
                                # the top suggestion splits the token into several words
                                multi_tokens = tempSuggestion.split(" ")
                                token = None
                            else:
                                token = tempSuggestion
                            suggestions = sugg[1:]
                except Exception:
                    token = t
                if token is None:
                    # token was split into multiple words by the top suggestion
                    num_tokens = len(multi_tokens)
                    for idx, tt in enumerate(multi_tokens):
                        result_content += generate_token_information(
                            aidx, sidx, tidx + idx, tt, suggestions,
                            spaces[tidx], tidx < (len(tokens) + shift - 1))
                    shift += num_tokens - 1
                else:
                    result_content += generate_token_information(
                        aidx, sidx, tidx + shift, token, suggestions,
                        spaces[tidx], tidx < (len(tokens) + shift - 1))
            if (aidx < len(input) - 1) or (sidx < len(article.sentences) - 1):
                if result_content[-1] != "," and result_content[-2] != ",":
                    result_content += ",\n"
    result_content += " ]\n}"
    return result_content
def set_line_manager(self, line_manager_):
    if line_manager_:
        self.lm = line_manager_
        if CAN_ASPELL:
            self.spell_checker = line_manager_.spell_checker
            print(line_manager_.spell_checker.lang)
            self.speller = aspell.Speller('lang',
                                          line_manager_.spell_checker.lang)
        next_error_button = wx.Button(self.panel, wx.ID_ANY,
                                      label='Next Error', size=(90, 30))
        self.Bind(wx.EVT_BUTTON, self.OnNextBadLine, next_error_button)
        next_error_button.SetDefault()
        next_error_button.SetSize(next_error_button.GetBestSize())
        self.current_text.Add(next_error_button, row=2, col=1)
        if self.speller:
            add_to_dictionary_button = wx.Button(self.panel, wx.ID_ANY,
                                                 label='+ to Dict', size=(90, 30))
            self.Bind(wx.EVT_BUTTON, self.OnAddToDict, add_to_dictionary_button)
            add_to_dictionary_button.SetDefault()
            add_to_dictionary_button.SetSize(
                add_to_dictionary_button.GetBestSize())
            self.current_text.Add(add_to_dictionary_button, row=5, col=1)
        # Sizers for layout
        self.panel.SetSizerAndFit(self.current_text)
def spell_checker(self, url, words=[]):
    """Spell checker.

    :param url: webpage url
    :param words: expected word list
    :return: list of misspelled words
    """
    self.open(url)
    cleanr = re.compile('<.*?>')
    page_content = re.sub(cleanr, '', self.get_page_source())
    cleantext = []
    speller_obj = aspell.Speller("lang", "en")
    if len(words):
        for word in words:
            speller_obj.addtoSession(word)
    invalidchars = set(string.punctuation.replace("_", ""))
    for word in nltk.word_tokenize(page_content):
        if any(invalidchar in word for invalidchar in invalidchars) or \
                len(word) < 2:
            continue
        cleantext.append(word)
    misspelled = list(
        set([
            word.encode('ascii', 'ignore') for word in cleantext
            if not speller_obj.check(word) and re.match('^[a-zA-Z ]*$', word)
        ]))
    return misspelled
def query_handler(query):
    correcter = aspell.Speller('lang', 'en')
    query = query.split()
    correct_query = []
    for term in query:
        # drop terms aspell has no suggestions for; otherwise keep the top suggestion
        suggestions = correcter.suggest(term)
        if not suggestions:
            continue
        correct_query.append(suggestions[0].lower())
    return correct_query
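# Usage sketch for query_handler (illustrative only; assumes an English aspell
# dictionary is installed). Misspelled terms are replaced by aspell's top
# suggestion, lower-cased.
def _demo_query_handler():
    corrected = query_handler('informtion retreival')
    print(corrected)  # likely ['information', 'retrieval'], dictionary-dependent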
def construct_globals():
    global MATCH_ALPHA_WORD, LOWER, speller, word_set, detokenizer
    MATCH_ALPHA_WORD = "[A-Za-zĂÂÎȘȚăâîșț]+"
    LOWER = [chr(i) for i in range(ord('a'), ord('z') + 1)]
    LOWER += list("ăâșîț")
    speller = aspell.Speller('lang', 'ro')
    word_set = set()
    detokenizer = Detok()
def __init__(self, wordlist):
    SpChecker.__init__(self, wordlist)
    self.compile_dict(wordlist)
    params = [
        ('master', './' + self.dictfile),
        ('master-path', './' + self.dictfile),
    ]
    self.sp = aspell.Speller(*params)
def __init__(self, lang):
    if lang in ['pt', 'pt_BR']:
        data_dir = os.path.expanduser('~/root/tmp/usr/lib64/aspell')
    else:
        data_dir = os.path.expanduser('~/root/usr/lib/aspell')
    self.speller = aspell.Speller(
        ('data-dir', data_dir),
        ('dict-dir', data_dir),
        ('size', '80'),
        ('sug-mode', 'fast'),
        ('encoding', 'utf-8'),
        ('lang', lang))
def cleanUnigramCountFile(inputfile, outputfile, n, language, filterByDictionary):
    '''filter the unigram count file, and reduce the number of items in it'''
    df = pandas.read_table(inputfile, encoding='utf-8')
    df.columns = ['word', 'count']
    # discard purely numeric items
    df_nonnumeric = df[[type(x) is unicode for x in df['word']]]
    # discard the </s> string
    df_clean = df_nonnumeric[[x != u'</s>' for x in df_nonnumeric['word']]]
    # delete apostrophes and numbers
    df_clean['word'] = [re.sub(u"’|'|\d", '', x) for x in df_clean['word']]
    # drop any empty strings
    df_clean = df_clean[[x != '' and x is not None for x in df_clean['word']]]
    df_clean['word'] = [cleanString(x) for x in df_clean['word']]
    # check whether the lower-case and title-case forms are in the dictionary
    aspellLang = language
    if aspellLang == 'pt':
        aspellLang = 'pt-BR'
    speller = aspell.Speller(('lang', aspellLang), ('encoding', 'utf-8'))
    df_clean['aspell_lower'] = [
        speller.check(x.lower().encode('utf-8')) == 1 for x in df_clean['word']
    ]
    df_clean['aspell_upper'] = [
        speller.check(x.title().encode('utf-8')) == 1 for x in df_clean['word']
    ]
    # convert anything that can be lower case to lower case
    df_clean['word'][df_clean['aspell_lower']] = [
        x.lower() for x in df_clean['word'][df_clean['aspell_lower']]
    ]
    if filterByDictionary:
        if language == 'de':
            # German nouns are capitalized, so accept title-case matches too
            df_clean = df_clean.ix[np.logical_or(df_clean['aspell_lower'],
                                                 df_clean['aspell_upper'])]
        else:
            df_clean = df_clean.ix[df_clean['aspell_lower']]
    to_write = df_clean.drop(['aspell_lower', 'aspell_upper'], axis=1)
    to_write['word'] = [x.lower() for x in to_write['word']]
    to_write.to_csv(outputfile, sep='\t', index=False, header=False,
                    encoding='utf-8')
    print('Wrote to file: ' + outputfile)
def main():
    nlp = English()
    speller = aspell.Speller('lang', 'en')
    for line in sys.stdin:
        text = line[:-1]
        tokens = tokenize(text, nlp=nlp)
        tokens = correct(tokens, speller=speller)
        text = untokenize(tokens)
        print(text)
def __init__(self, bot: UtilsBot):
    self.bot = bot
    self.speller = aspell.Speller('lang', 'en')
    self.api_db = self.bot.mongo.client.api.users
    app = web.Application()
    app.add_routes([web.post('/speak', self.handle_speak_message),
                    web.post('/disconnect', self.handle_disconnect),
                    web.get('/check_access', self.check_access),
                    web.get('/avatar_urls', self.avatar_urls),
                    web.get('/regen_img/{data}', self.regen_image)])
    # noinspection PyProtectedMember
    self.bot.loop.create_task(self.start_site(app))
def set_line_manager(self, line_manager_, strict):
    self.strict = strict
    if line_manager_:
        self.lm = line_manager_
        if CAN_ASPELL:
            self.spell_checker = line_manager_.spell_checker
            self.speller = aspell.Speller('lang',
                                          line_manager_.spell_checker.lang)
        button_row = 2
        next_line_button = wx.Button(self.panel, wx.ID_ANY,
                                     label='Next Line', size=(90, 30))
        self.Bind(wx.EVT_BUTTON, self.OnNextLine, next_line_button)
        next_line_button.SetDefault()
        next_line_button.SetSize(next_line_button.GetBestSize())
        self.current_text.Add(next_line_button, row=button_row, col=1)
        button_row += 1
        previous_line_button = wx.Button(self.panel, wx.ID_ANY,
                                         label='Prev Line', size=(90, 30))
        self.Bind(wx.EVT_BUTTON, self.OnPreviousLine, previous_line_button)
        previous_line_button.SetDefault()
        previous_line_button.SetSize(previous_line_button.GetBestSize())
        self.current_text.Add(previous_line_button, row=button_row, col=1)
        if self.strict:
            button_row += 1
            join_line_button = wx.Button(self.panel, wx.ID_ANY,
                                         label='Join Lines', size=(90, 30))
            self.Bind(wx.EVT_BUTTON, self.OnJoinLines, join_line_button)
            join_line_button.SetDefault()
            join_line_button.SetSize(join_line_button.GetBestSize())
            self.current_text.Add(join_line_button, row=button_row, col=1)
        if self.speller:
            button_row += 1
            add_to_dictionary_button = wx.Button(self.panel, wx.ID_ANY,
                                                 label='+ to Dict', size=(90, 30))
            self.Bind(wx.EVT_BUTTON, self.OnAddToDict, add_to_dictionary_button)
            add_to_dictionary_button.SetDefault()
            add_to_dictionary_button.SetSize(
                add_to_dictionary_button.GetBestSize())
            self.current_text.Add(add_to_dictionary_button,
                                  row=button_row, col=1)
        # Sizers for layout
        self.panel.SetSizerAndFit(self.current_text)
def initialize_speller():
    """ Initialize and return speller module. """
    speller = None
    try:
        speller = aspell.Speller(('lang', 'en'),
                                 ('personal-dir', '.'),
                                 ('personal', VOCABULARY))
    except aspell.AspellConfigError as exception:
        # some versions of aspell use a different path
        logging.debug(
            'Encountered exception when trying to initialize spelling: %s',
            exception)
        try:
            speller = aspell.Speller(('lang', 'en'),
                                     ('personal-path', './' + VOCABULARY))
        except aspell.AspellSpellerError as exception:
            logging.error('Could not initialize speller: %s', exception)
    if speller:
        for key in speller.ConfigKeys():
            logging.debug('%s %s', key[0], key[2])
    return speller
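# Usage sketch for initialize_speller (illustrative): the helper returns None
# when aspell cannot be configured, so callers should guard against that. The
# sample words below are assumptions, not part of the original module.
def _demo_initialize_speller():
    speller = initialize_speller()
    if speller:
        for word in ('background', 'backgroud'):
            if not speller.check(word):
                print('unknown:', word, '->', speller.suggest(word)[:3])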
def __init__(self, tokenize=True, pretrained=False, device="cpu"):
    self.tokenize = tokenize
    self.pretrained = None
    self.device = None
    self.ckpt_path = None
    self.vocab_path, self.weights_path = "", ""
    self.model, self.vocab = None, None
    self.model = aspell.Speller()
    # sug-mode can be: ultra, fast, normal, slow, or bad-spellers
    self.model.setConfigKey('sug-mode', "normal")
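# Illustrative sketch of the sug-mode trade-off used above: "ultra" and "fast"
# are quicker but search less deeply, while "slow" and "bad-spellers" look
# harder for badly mangled words. The checker name and sample word below are
# assumptions, not part of the original module.
def _demo_sug_mode():
    checker = aspell.Speller()
    checker.setConfigKey('sug-mode', 'bad-spellers')
    print(checker.suggest('funetik'))  # typically a wider candidate list than "fast"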
def main(filename):
    speller = aspell.Speller('lang', LANG)
    buffersize = 2**16
    with open(filename) as f:
        while True:
            lines_buffer = f.readlines(buffersize)
            if not lines_buffer:
                break
            for line in lines_buffer:
                word = line.strip()
                if speller.check(word):
                    print(word)
def missing_targets_aspell(target_words):
    s = ap.Speller('lang', 'en')
    missing_words = set()
    for word in target_words:
        if not s.check(word):
            missing_words.add(word)
    missing_words = list(missing_words)
    with open('../spelling_mistakes/aspell_missing_targets.txt', 'w') as aspell_misspelling:
        for word in missing_words:
            new_line = word + '\n'
            aspell_misspelling.write(new_line)
def check_one(bot, word):
    c = aspell.Speller('lang', 'en')
    if c.check(word):
        bot.say("I don't see any problems with that word.")
        return
    else:
        suggestions = c.suggest(word)[:5]
        if len(suggestions) == 0:
            bot.say("That doesn't seem to be correct.")
        else:
            bot.say("That doesn't seem to be correct. Try {0}.".format(
                ', '.join(['"{0}"'.format(s) for s in suggestions])))
def check_multiple(bot, words):
    mistakes = []
    c = aspell.Speller('lang', 'en')
    for word in words:
        if not c.check(word):
            mistakes.append(word)
    if len(mistakes) == 0:
        bot.say("Nothing seems to be misspelled.")
    else:
        bot.say('The following word(s) seem to be misspelled: {0}'.format(
            ', '.join(['"{0}"'.format(w) for w in mistakes])))
def separate_waw(text):
    ar_spell = aspell.Speller('lang', 'ar')
    words = text.split()
    sentence = ''
    for word in words:
        if word.startswith('و'):
            if word in ar_spell:
                sentence += word + ' '
            else:
                sentence += 'و ' + word[1:] + ' '
                print('{} changed to {}'.format(word, 'و ' + word[1:]))
        else:
            sentence += word + ' '
    return sentence
def bag_of_words_features(document, word_features, spell):
    features = {}
    s = aspell.Speller('lang', 'en')
    if spell:
        for i in range(len(document)):
            if not s.check(document[i]):
                suggestions = s.suggest(document[i])
                # only replace when there is a small, plausible suggestion list
                if 0 < len(suggestions) < 15 and suggestions[0] in word_features:
                    document[i] = suggestions[0]
    document_words = set(document)
    for word in word_features:
        features['contains(%s)' % word] = (word in document_words)
    return features
def make_light_lexicon(infile, outfile):
    ar_spell = aspell.Speller(('dict-dir', './ar_dict/'),
                              ('lang', 'ar'),
                              ('encoding', 'utf-8'))
    lexicon = open(infile, encoding='utf-8').read().split()
    print(infile, 'size', len(lexicon))
    light_lexicon = set()
    for word in lexicon:
        light_word = light_stem_word(word)
        if light_word != word and light_word not in lexicon \
                and light_word in ar_spell:
            light_lexicon.add(light_word)
    light_lexicon = list(sorted(light_lexicon))
    print('light size', len(light_lexicon))
    with open(outfile, mode='w', encoding='utf-8') as file_writer:
        file_writer.write('\n'.join(light_lexicon))
def __init__(self, opts, *args, **qdict):
    BaseHTTPServer.HTTPServer.__init__(self, *args, **qdict)
    import nltk
    _SENTENCE_TOKENIZE_MODEL = "tokenizers/punkt/english.pickle"
    self.tokenizer = nltk.data.load(_SENTENCE_TOKENIZE_MODEL)
    self.speller = aspell.Speller('lang', 'en')
    try:
        sennabin = unicode(os.environ['SENNAPATH'])
    except KeyError:
        sennabin = u"/data/tool/senna/"
    self.senna = src.tools.senna.SennaWrap(sennabin)
    self.funcs = {'split': self.split,
                  'spell': self.spell,
                  'pas': self.pas,
                  'score': self.score}
    M_PATH = opts.model_dir
    self.model = SklearnClassifier()
    self.model.load_model(M_PATH)
    self.model.load_fmap(M_PATH)
def save_command(bot, trigger):
    """Commit pending changes to the bot's personal dictionary.

    This action cannot be undone, except by manually editing the aspell
    dictionary file.
    """
    for word in bot.memory['spellcheck_pending_adds']:
        if word != word.strip() and trigger.group(2) != 'force':
            bot.say('"{0}" contains extra whitespace. Amend the pending list with '
                    '{1}scdel/{1}scadd, or force saving anyway with {1}scsave force.'
                    .format(word, bot.config.core.help_prefix))
            return
    c = aspell.Speller('lang', 'en')
    for word in bot.memory['spellcheck_pending_adds']:
        c.addtoPersonal(word)
    c.saveAllwords()
    bot.say('Saved {0} pending words to my word list.'
            .format(len(bot.memory['spellcheck_pending_adds'])))
    del bot.memory['spellcheck_pending_adds'][:]  # list.clear() is py3.3+ only :(
def go():
    fn = "test-data/cyprob-page-000.txt"
    nm, ext = os.path.splitext(fn)
    new_fn = "{nm}-auto-corrected{ext}".format(nm=nm, ext=ext)
    greek_speller = aspell.Speller('lang', 'el')
    get_first_suggestion_ = partial(get_first_suggestion, speller=greek_speller)
    new_lines = list()
    with open(fn, "r") as fin:
        for ln in fin:
            nln = " ".join(
                [get_first_suggestion_(word=word) for word in ln.split()])
            new_lines.append(nln)
    with open(new_fn, "w") as fout:
        fout.write("\n".join(new_lines))
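# go() partially applies get_first_suggestion with the Greek speller above. A
# minimal sketch of that helper, matching the call signature; the body is an
# assumption for illustration, not the original implementation.
def get_first_suggestion(word, speller):
    # keep words the speller accepts; otherwise fall back to aspell's first
    # suggestion, or the original word when there are no suggestions
    if not word or speller.check(word):
        return word
    suggestions = speller.suggest(word)
    return suggestions[0] if suggestions else word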
def filterByDictionary(merged, dictionary_filter, language):
    if dictionary_filter is None:
        print('Not limiting words to a spelling dictionary')
    elif dictionary_filter in ('lowerInDictionary', 'inDictionary'):
        aspellLang = language
        if aspellLang == 'pt':
            aspellLang = 'pt-BR'
        speller = aspell.Speller(('lang', aspellLang), ('encoding', 'utf-8'))
        merged['aspell_lower'] = [
            speller.check(x.lower().encode('utf-8')) == 1 for x in merged['word']
        ]
        if dictionary_filter == 'lowerInDictionary':
            if aspellLang == 'de':
                raise ValueError(
                    'German must use the inDictionary filter setting because '
                    'all nouns are capitalized')
            print('Limiting to words with a lower-case form in the spelling dictionary')
            merged = merged[merged['aspell_lower']]  # only keep lower-case matches
        elif dictionary_filter == 'inDictionary':
            print('Limiting to words with a lower-case or title-case form in the '
                  'spelling dictionary')
            # German nouns are capitalized, so the title-case form is checked too
            # TODO: should this check x.upper() as well as x.title()?
            merged['aspell_upper'] = [
                speller.check(x.title().encode('utf-8')) == 1 for x in merged['word']
            ]
            merged = merged[merged['aspell_upper'] | merged['aspell_lower']]
    else:
        raise ValueError(
            'Dictionary specification not recognized. Choose None, '
            '"lowerInDictionary" or "inDictionary"')
    return merged
def aspell_dict(input_dict, name):
    """
    Generates a dictionary of corrections carried out by the Aspell spelling
    correction tool.

    :param input_dict: input template dictionary
    :param name: name of dataset for use in file output write
    :return: updated dictionary with suggestions and candidates, name of file
        written to
    """
    file_name = name + "_aspell_dict.txt"
    spell = aspell.Speller('lang', 'en')
    working_dict = dict(input_dict)
    counter = 0
    for misspelling, details in working_dict.items():
        if counter % 100 == 0:
            print("Aspell iteration:", counter)
        suggestions = spell.suggest(misspelling)
        if len(suggestions) > 0:
            working_dict[misspelling]['suggested'] = suggestions[0]
            working_dict[misspelling]['candidates'] = suggestions[:10]
        counter += 1
    return working_dict, file_name
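# Usage sketch for aspell_dict (illustrative): the template maps each
# misspelling to a detail dict that the function fills in. The sample words
# below are assumptions about the surrounding pipeline, not real data.
def _demo_aspell_dict():
    template = {
        'recieve': {'suggested': None, 'candidates': []},
        'teh': {'suggested': None, 'candidates': []},
    }
    corrections, out_name = aspell_dict(template, 'sample')
    print(out_name)                             # sample_aspell_dict.txt
    print(corrections['recieve']['suggested'])  # aspell's top suggestion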
def spellcheck(self, spellchecker, num, exclude_list=None):
    if exclude_list is not None:
        exclude_list = exclude_list.split(',')
    if spellchecker == 'aspell':
        s = aspell.Speller('lang', 'en')
        self.replaced = {}
        for review in self.tokens:
            for i in range(len(review)):
                token = review[i].encode('utf8')
                # don't use encode/decode here because then everything is an int
                if sum(char.isdigit() for char in review[i]) / len(review[i]) > 0.5:
                    if num == True:
                        if exclude_list is None or review[i] not in exclude_list:
                            # encode to the same token indicating a number is present
                            review[i] = '_num'
                elif not s.check(token):
                    suggestions = s.suggest(token)
                    if any([word.lower() == token.decode('utf8').lower()
                            for word in suggestions]):
                        replace = next((word for word in suggestions
                                        if word.lower() == token.decode('utf8').lower()),
                                       None)
                        self.replaced[review[i]] = replace  # cache results
                        review[i] = replace
                    else:
                        # exclude this instance from the frequency count
                        frequencies = np.array(
                            (self.tokens_dist.freq(token.decode('utf8'))
                             * self.tokens_dist.N() - 1))
                        for j in range(len(suggestions)):
                            frequencies = np.append(
                                frequencies,
                                self.tokens_dist.freq(suggestions[j]) * self.tokens_dist.N())
                        most_frequent_index = np.argmax(frequencies)
                        if most_frequent_index != 0:
                            replace = suggestions[most_frequent_index - 1]
                            self.replaced[review[i]] = replace  # cache results
                            review[i] = replace
    elif num == True:
        for review in self.tokens:
            for i in range(len(review)):
                if sum(char.isdigit() for char in review[i]) / len(review[i]) > 0.5:
                    if exclude_list is None or review[i] not in exclude_list:
                        # encode to the same token indicating a number is present
                        review[i] = '_num'
class pluginClass(plugin):
    s = aspell.Speller('lang', 'en')

    def gettype(self):
        return "command"

    def action(self, complete):
        msg = complete.message().decode('utf-8')
        if self.s.check(msg):
            msg = '"' + msg + '" is spelled correctly'
        else:
            suggs = self.s.suggest(msg)
            if len(suggs) > 0:
                msg = ', '.join(suggs)
            else:
                msg = 'No spelling suggestions.'
        return ["PRIVMSG $C$ :" + msg]

    def describe(self, complete):
        return [
            "PRIVMSG $C$ :I am the !spell module",
            "PRIVMSG $C$ :Usage:",
            "PRIVMSG $C$ :!spell [word]"
        ]
parser = argparse.ArgumentParser(
    description='Arabic spell checker based on aspell. The input is a file '
                'and the output is errors with frequencies.')  # type: ArgumentParser
parser.add_argument('-i', '--infile',
                    type=argparse.FileType(mode='r', encoding='utf-8'),
                    help='input file.', required=True)
parser.add_argument('-o', '--outfile',
                    type=argparse.FileType(mode='w', encoding='utf-8'),
                    help='output file.', required=True)

if __name__ == '__main__':
    ar_spell = aspell.Speller('lang', 'ar')
    args = parser.parse_args()
    words = args.infile.read().split()
    outfile = args.outfile
    errors_count = dict()
    for word in words:
        if not ar_spell.check(word):
            if word not in errors_count:
                errors_count[word] = 1
            else:
                errors_count[word] += 1
    sorted_freq = sorted(errors_count.items(), key=operator.itemgetter(1),
                         reverse=True)
    outfile.write('# word\tfreq\tsuggestion\n')
    for word, freq in sorted_freq:
        # write each error with its frequency and aspell's top suggestion
        # (loop body reconstructed from the header line above; assumed format)
        suggestions = ar_spell.suggest(word)
        suggestion = suggestions[0] if suggestions else ''
        outfile.write('{}\t{}\t{}\n'.format(word, freq, suggestion))
import spacy
import re
import emoji
from nltk.tokenize import TweetTokenizer
from nltk import ngrams
import aspell

s = aspell.Speller('lang', 'es')
nlp = spacy.load('es_core_news_md')
tokenize = nlp.tokenizer


class Tweet:
    def __init__(self, text):
        self.raw_text = text
        self.clean_text = None
        self.doc = None

    def filter(self, *args):
        """
        Apply optional filters: 'retweets', 'emoticons', 'handles', 'urls',
        'hashtags' and '*'.
        """
        exps = []
        # these patterns can be improved; some matches are not being extracted
        if "retweets" in args:
            exps.append(re.compile(
                "^RT ?(?<=^|(?<=[^a-zA-Z0-9-_\.]))@([A-Za-z]+[A-Za-z0-9-_]+):"))
        if "emoticons" in args:
            exps.append("emoticons")
        if "flags" in args:
            exps.append(re.compile(u"[\U0001F1E6-\U0001F1FF]"))
        if "handles" in args: