class Bot:
    '''Minimal HTTP bot: spell-corrects an incoming query, runs it through
    an NLU interpreter, and returns intent/entities as JSON.'''

    # NOTE(review): these are class-level attributes, shared by every Bot
    # instance and every request — confirm that one SpellChecker shared
    # across concurrent requests is intended (it carries mutable text state).
    app = Klein()
    my_dict = DictWithPWL("en_US", "words.txt")  # en_US + project word list
    chkr = SpellChecker(my_dict)

    def __init__(self):
        self.test = 'hi'

    @app.route('/parse', methods=['GET'])
    def parse(self, request):
        '''Handle GET /parse?q=<text>: spell-correct, parse, return JSON.'''
        request.setHeader('Content-Type', 'application/json')
        # Twisted request.args maps bytes -> list[bytes]; decode both sides
        # and keep only the first value of each parameter.
        request_params = {
            key.decode('utf-8', 'strict'): value[0].decode('utf-8', 'strict')
            for key, value in request.args.items()
        }
        text = str(request_params['q']).strip()
        # Replace every misspelled word in-place with the "best" suggestion
        # chosen by the external get_best_word() helper.
        self.chkr.set_text(text)
        for err in self.chkr:
            err.replace(get_best_word(self.chkr, err.word))
        spell_checked = self.chkr.get_text()
        # NOTE(review): `unicode` exists only on Python 2, while the
        # print(...) call and bytes-decoding above read as Python 3 —
        # confirm which interpreter this file targets.
        resp = interpreter.parse(unicode(spell_checked, encoding="utf-8"))
        print(resp)
        # Only trust the parse when the interpreter is >50% confident.
        if (float(resp['intent']['confidence']) > 0.5):
            reply = {"intent": resp['intent'], "entities": resp['entities']}
        else:
            reply = {"intent": {"name": "None"}, "entities": ""}
        return json.dumps(dict(reply), indent=4)
def addCustomDict(self, customDictPath):
    """Attach the personal word list at *customDictPath* as an extra
    checker, caching created dictionaries by (lang, path)."""
    try:
        self._createCustomDictLang(self._folders[-1])
    except IOError:
        # Best effort: an unwritable/missing folder is tolerated here.
        pass

    key = (CUSTOM_DICT_LANG, customDictPath)
    if key in self._dictCache:
        currentDict = self._dictCache[key]
    else:
        broker = Broker()
        broker.set_param('enchant.myspell.dictionary.path',
                         self._folders[-1])
        try:
            currentDict = DictWithPWL(CUSTOM_DICT_LANG,
                                      customDictPath,
                                      broker=broker)
        except enchant.errors.Error:
            # Dictionary could not be created; skip it silently.
            return
        self._dictCache[key] = currentDict

    self._customCheckers.append(currentDict)
def addCustomDict(self, customDictPath):
    """Register the personal word list at *customDictPath*, logging (not
    raising) on failure; created dictionaries are cached by (lang, path)."""
    try:
        self._createCustomDictLang(self._folders[-1])
    except IOError as err:
        logger.error("Can't create custom dictionary")

    key = (CUSTOM_DICT_LANG, customDictPath)
    if key in self._dictCache:
        currentDict = self._dictCache[key]
    else:
        broker = Broker()
        broker.set_param('enchant.myspell.dictionary.path',
                         self._folders[-1])
        try:
            currentDict = DictWithPWL(CUSTOM_DICT_LANG,
                                      customDictPath,
                                      broker=broker)
        except enchant.errors.Error as err:
            logger.error('Custom dictionary error. path={}; lang={}'.format(customDictPath, key))
            logger.error(err)
            return
        self._dictCache[key] = currentDict

    self._customCheckers.append(currentDict)
def _get_language_checker(self, po_file, reports):
    """Get checker for PO file language.

    Returns a one-element list containing a SpellChecker for the PO file's
    language (or 'en'), or an empty list when spelling checks are disabled
    or the dictionary/PWL could not be set up (a PoReport is appended to
    *reports* in that case).
    """
    if not self.spelling:
        return []
    if not ENCHANT_FOUND:
        raise ImportError('Enchant module not found (please install '
                          '"pyenchant")')
    # 'str' mode checks translated strings in their own language;
    # otherwise the msgids are checked as English.
    if self.spelling == 'str':
        lang = po_file.props['language']
    else:
        lang = 'en'
    checker = []
    try:
        # Personal word list is materialised into a temp file so enchant
        # can load it alongside the language dictionary.
        with tempfile.NamedTemporaryFile() as tmp_file:
            tmp_file.write(self.pwl.encode('utf-8'))
            tmp_file.flush()
            _dict = DictWithPWL(lang, tmp_file.name)
            checker.append(SpellChecker(_dict))
    except DictNotFoundError:
        reports.append(
            PoReport(
                'enchant dictionary not found for language "{0}"'
                ''.format(lang),
                'dict', po_file.filename,
                po_file.props['language_numline']))
        checker = []
    except IOError as exc:
        reports.append(
            PoReport(str(exc), 'pwl', po_file.filename,
                     po_file.props['language_numline']))
        checker = []
    return checker
def checkAllFiles() : lang = os.environ.get('POOTLE_LANG') spellCheckLang = os.environ.get('SPELLCHECK_LANG') if (os.environ.get('POOTLE_LANG') == None) : print "The POOTLE_LANG variable is not set!" print "Please set it with export POOTLE_LANG=hu before calling this script!" print "The variable should match with the language code on the pootle." return if (spellCheckLang == None) : print "The SPELLCHECK_LANG variable is not set!" print "Please set it with export SPELLCHECK_LANG=hu_HU before calling this script!" print "The variable should match with the language code on the pootle." return pwl = DictWithPWL(spellCheckLang, "known_words_database/" +lang+ ".txt") chkr = SpellChecker(pwl) with open("tools/packagelist.txt") as f: fileList = f.readlines() fileList = [x.strip() for x in fileList] for filename in fileList : checkFile("translations/"+lang+"/" + filename, chkr)
def spellcheck_hints(args, packages):
    """Spell-check the sdesc/ldesc/message hints of every package and print
    each misspelled word with its occurrence count, most frequent first.

    The dictionary is seeded from words.txt and from the package names
    themselves (split on '-'/'_', punctuation and trailing digits removed,
    with and without a 'lib' prefix).
    """
    # NOTE(review): every other module in this codebase uses the underscore
    # tag 'en_US'; confirm that 'en-US' actually resolves on this enchant
    # build before "fixing" it.
    spelldict = DictWithPWL('en-US')
    chkr = SpellChecker(spelldict, filters=[DescFilter])
    misspellings = {}

    # add technical words not in spell-checking dictionary
    wordlist = []
    with open('words.txt') as f:
        for w in f:
            # strip any trailing comment
            w = re.sub(r'#.*$', '', w)
            # strip any whitespace
            w = w.strip()
            # Fix: skip blank and comment-only lines instead of feeding the
            # empty string to spelldict.add() (pyenchant rejects empty words).
            if not w:
                continue
            spelldict.add(w)
            wordlist.append(w.lower())
            # XXX: for the moment, to reduce the set of errors, ignore the
            # fact that words.txt gives a canonical capitalization, and
            # accept any capitalization
            spelldict.add(w.lower())
            spelldict.add(w.capitalize())

    # add all package names as valid words
    for p in packages:
        for w in re.split('[_-]', p):
            # remove punctuation characters
            w = re.sub(r'[+]', '', w)
            # strip off any trailing numbers
            w = re.sub(r'[\d.]*$', '', w)
            # both with and without any lib prefix
            for wl in [w, re.sub(r'^lib', '', w)]:
                # Fix: stripping digits/punctuation can leave an empty
                # string; don't add it to the dictionary.
                if not wl:
                    continue
                # add the package name unless it exists in the list above,
                # which will give a canonical capitalization
                if wl.lower() not in wordlist:
                    spelldict.add(wl.lower())
                    spelldict.add(wl)
                    spelldict.add(wl.capitalize())

    # for each package
    for p in sorted(packages.keys()):
        # debuginfo packages have uninteresting, auto-generated text which
        # contains the package name
        if p.endswith('-debuginfo'):
            continue
        # spell-check the spell-checkable keys
        for k in ['sdesc', 'ldesc', 'message']:
            if k in packages[p].hints:
                chkr.set_text(packages[p].hints[k])
                # XXX: this is doing all the work to generate suggestions,
                # which we then ignore, so could be written much more
                # efficiently
                for err in chkr:
                    misspellings.setdefault(err.word, 0)
                    misspellings[err.word] += 1

    # summarize
    for c in sorted(misspellings, key=misspellings.get, reverse=True):
        print('%16s: %4d' % (c, misspellings[c]))
def _init_spell_checker(self):
    """Initialize spell checker dictionary.

    Builds a SpellChecker backed by en_US plus, in order: an optional
    jargon file (from the 'jargonfile' parameter or the JARGONFILE env
    var), a per-module jargon.txt fetched from the module's dist-git, and
    the '-'-separated components of the module name.

    Returns the configured SpellChecker; calls self.error(...) on failure.
    """
    default_dict = "en_US"
    spell_dict = None
    jargonfile = self.params.get('jargonfile')
    if not jargonfile:
        jargonfile = os.environ.get('JARGONFILE')
    if jargonfile is not None:
        try:
            jargonfile = str(jargonfile)
            spell_dict = DictWithPWL(default_dict, jargonfile)
        # Fix: was a bare `except:` which also swallowed SystemExit and
        # KeyboardInterrupt; narrowed to Exception (behavior otherwise kept).
        except Exception:
            self.error(
                "Could not initialize dictionary using %s file" % jargonfile)
    if not spell_dict:
        try:
            spell_dict = DictWithPWL(default_dict)
        except Exception:  # fix: was a bare except
            self.error(
                "Could not initialize spell checker with dictionary %s"
                % default_dict)
    # Check if there is jargonfile on module repo
    url = ("https://src.fedoraproject.org/cgit/modules/%s.git/plain/jargon.txt"
           % self.mmd.name)
    resp = requests.get(url)
    if resp.status_code >= 200 and resp.status_code < 300:
        for w in resp.content.split("\n"):
            if w != '':
                spell_dict.add_to_session(w)
    # add words from module name as jargon
    for w in self.mmd.name.split('-'):
        spell_dict.add_to_session(w)
    try:
        chkr = SpellChecker(spell_dict)
    except Exception:  # fix: was a bare except
        self.error("Could not initialize spell checker")
    return chkr
def spellCheckHelper(self, row):
    """Return the number of spelling errors in the 'essay' field of *row*.

    Uses en_US augmented with the project word list in morewords.txt.
    """
    # Fix: removed the unused `count` variable and the RegexpTokenizer
    # instance the original constructed and never used.
    my_dict = DictWithPWL("en_US", "morewords.txt")
    my_checker = SpellChecker(my_dict)
    # use this tokenizer since it eliminates punctuation
    my_checker.set_text(row['essay'])
    # NOTE(review): relies on len() of a SpellChecker — confirm the
    # pyenchant version in use supports it.
    return len(my_checker)
def spelling(text): my_dict = DictWithPWL("en_US", "myDict.txt") my_checker = SpellChecker(my_dict) my_checker.set_text(text) e = 0 print ' Spelling errors: ' for error in my_checker: print " ", error.word e = e + 1 return e
def find_sug_words(evt):
    """Listbox callback: check the selected word against the personal
    dictionary and clear the suggestion listbox.

    Fix: removed ~15 locals that were assigned but never read (suget_wrd,
    levn_cost, dict/max — which shadowed builtins — tmp, wrd_lst_pair,
    len_mispld_wrd, target, source, columns, rows, m, inpt_str, word,
    inpt_txt, chk_status) and the large runs of commented-out experiments.
    Observable behavior (widget reads, prints, listbox clear) is unchanged.
    """
    value = wrd_lstbox.get(ANCHOR)
    print("value", value)
    # Personal word list layered over the US-English dictionary.
    d = DictWithPWL("en_US", "Word Dicitionary4.txt")
    print(d.check(value))
    # Suggestions are computed but, as in the original, not yet inserted
    # into the listbox (the insertion loop was commented out).
    suggestions = d.suggest(value)  # TODO: insert into sugst_lstbx
    sugst_lstbx.delete(0, END)
    print("Listbox pressed")
def get_spelling_error_count(essay):
    """Return (total misspelling count, unique misspelled words) for *essay*.

    Checks against en_US plus the project word list morewords.txt.
    """
    checker = SpellChecker(DictWithPWL("en_US", "morewords.txt"))
    checker.set_text(essay)
    found = [err.word for err in checker]
    return len(found), list(set(found))
def initialise(self, sitecheck):
    """Set up the spell checker, preferring a site-local dict.txt, then
    the module's bundled dict.txt, then the plain language dictionary."""
    super(Spelling, self).initialise(sitecheck)
    # Spell checker must be re-created when check is resumed
    global _enchant_available
    if _enchant_available:
        # Fix: the default path was built as dirname(...) + 'dict.txt'
        # with no separator, yielding e.g. '/path/to/modulesdict.txt'
        # which can never exist; join the components properly.
        ddp = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'dict.txt')
        # root_path is concatenated directly, as in the original —
        # presumably it ends with a separator; verify against callers.
        cdp = self.sitecheck.session.root_path + 'dict.txt'
        if os.path.exists(cdp):
            self.dictionary = cdp
            d = DictWithPWL(self.language, cdp)
        elif os.path.exists(ddp):
            self.dictionary = ddp
            d = DictWithPWL(self.language, ddp)
        else:
            d = Dict(self.language)
        self.spell_checker = SpellChecker(d,
                                          filters=[EmailFilter, URLFilter])
def test_DWPWL_empty(tmp_path):
    """DictWithPWL with no backing files: session-level add/remove only."""
    checker = DictWithPWL("en_US", None, None)
    # Base dictionary still answers normal checks.
    assert checker.check("hello")
    assert not checker.check("helo")
    # Unknown word becomes valid once added...
    assert not checker.check("Flagen")
    checker.add("Flagen")
    assert checker.check("Flagen")
    # ...and a dictionary word can be excluded and re-added.
    checker.remove("hello")
    assert not checker.check("hello")
    checker.add("hello")
    assert checker.check("hello")
def query_likelihood_35(query, query_id):
    """Rank documents for *query* with the 0.35-smoothed query-likelihood
    model and append the top 100 results (TREC-style lines) to
    Query_Likelihood_Model_0.35_Ranking_with_spell_checking.txt.

    Query terms are spell-corrected first: the first suggestion that
    exists in the index (dict_term_unigram_df) replaces a misspelled term.
    """
    spell_checker = SpellChecker(DictWithPWL("en_US"))
    QLM_35_dict = {}
    doc_list = []
    queryStr = ""
    for term in query.split():
        queryStr += term + " "
        # Enabling Spell checker to find typos in the query
        spell_checker.set_text(term)
        spell_list = []
        for error in spell_checker:
            spell_list = error.suggest(error.word)
        # Means a typo has been detected
        if len(spell_list) != 0:
            for word in spell_list:
                if word in dict_term_unigram_df:
                    term = word
                    break
        # Collect the candidate documents from the term's posting string
        # ("doc1, doc2, ..., " — the trailing empty element is dropped).
        if term in dict_term_unigram_df.keys():
            str1 = dict_term_unigram_df[term]
            str2 = str1.split(",")
            for x in str2[:-1]:
                if x.strip() not in doc_list:
                    doc_list.append(x.strip())
    for doc_id in doc_list:
        QLM_35_dict[doc_id] = calculate_score(query, doc_id, query_id)
    sorted_dict = sorted(QLM_35_dict.items(), key=operator.itemgetter(1))
    ranked_data = sorted_dict[::-1][0:100]
    banner = ("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
              "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + "\n")
    # Fix: write through a context manager (the original leaked the file
    # handle if a write raised) and avoid shadowing the builtin `file`.
    with open("Query_Likelihood_Model_0.35_Ranking_with_spell_checking.txt",
              'a') as out:
        out.write(str(banner))
        out.write("Query : " + str(queryStr) + "\n \n")
        rank = 0
        for key, value in ranked_data:
            rank += 1
            temp_str = (str(query_id) + " " + "Q0" + " " + " " + str(key)
                        + " " + str(rank) + " " + str(value) + " "
                        + "QLM35" + "\n")
            out.write(temp_str + "\n")
        out.write(str(banner))
def setUp(self):
    """
    Verify required modulemd file parameter has been specified, exists,
    and can be loaded. The file name and loaded metadata are saved.
    """
    mdfile = self.params.get('modulemd')
    if mdfile is None:
        self.error("modulemd parameter must be supplied")
    mdfile = str(mdfile)
    if not os.path.isfile(mdfile):
        self.error("modulemd file %s must exist" % mdfile)
    try:
        mmd = modulemd.ModuleMetadata()
        mmd.load(mdfile)
    except Exception as ex:
        self.error(
            "There was an error while processing modulemd file %s: %s"
            % (mdfile, ex))
    # Infer the module name from the mdfile name and check that it is sane
    mdfileModuleName, mdfileExtension = os.path.basename(mdfile).split(
        '.', 1)
    if (mdfileExtension != 'yaml') and (mdfileExtension != 'yml'):
        self.error("modulemd file %s must have a .y[a]ml extension"
                   % mdfile)
    if mmd.name == '':
        # The name can be missing from the metadata because the builder
        # knows how to infer it
        mmd.name = mdfileModuleName
    elif mmd.name != mdfileModuleName:
        self.error(
            "modulemd file name %s and module name %s do not match"
            % (mdfileModuleName, mmd.name))
    self.mdfile = mdfile
    self.mmd = mmd
    # Build the spell checker, optionally seeded with a jargon file.
    # Fix: the original bound the dictionary to the name `dict` (shadowing
    # the builtin), and on the no-jargonfile path that name was unbound —
    # the except handler's error message would itself raise NameError.
    spell_dict = None
    try:
        jargonfile = self.params.get('jargonfile')
        if jargonfile is not None:
            jargonfile = str(jargonfile)
            spell_dict = DictWithPWL("en_US", jargonfile)
            # Module-name components are accepted as jargon for this run.
            for w in self.mmd.name.split('-'):
                spell_dict.add_to_session(w)
            self.chkr = SpellChecker(spell_dict)
        else:
            self.chkr = SpellChecker("en_US")
    except Exception:  # fix: was a bare except
        self.error(
            "Could not initialize spell checker with dictionary %s"
            % spell_dict)
def test_pwl(self):
    """Test checker loop with PWL."""
    from enchant import DictWithPWL
    pwl_dict = DictWithPWL("en_US", None, None)
    sample = "I am sme text to be cheked with personal list of cheked words"
    checker = SpellChecker(pwl_dict, sample)
    for n, err in enumerate(checker):
        if n == 0:
            self.assertEqual(err.word, "sme")
        elif n == 1:
            self.assertEqual(err.word, "cheked")
            # Accepting the word here must stop its second occurrence
            # from being reported.
            checker.add()
    self.assertEqual(n, 1)
def test_pwl():
    """Test checker loop with PWL."""
    from enchant import DictWithPWL
    pwl_dict = DictWithPWL("en_US", None, None)
    sample = "I am sme text to be cheked with personal list of cheked words"
    checker = SpellChecker(pwl_dict, sample)
    for n, err in enumerate(checker):
        if n == 0:
            assert err.word == "sme"
        elif n == 1:
            assert err.word == "cheked"
            # Accepting the word here must suppress its second occurrence.
            checker.add()
    assert n == 1
def myspell(fname):
    """Spell-check *fname* line by line, printing each misspelled word
    together with a line counter.

    Dictionary is en_US plus the personal word list mywords.txt; email
    addresses and URLs are skipped.
    """
    my_dict = DictWithPWL('en_US', 'mywords.txt')
    print(my_dict)
    spell_checker = SpellChecker(my_dict, filters=[EmailFilter, URLFilter])
    # Fix: open the file in a context manager so the handle is always
    # closed (the original never closed it).
    with open(fname, 'r') as fp:
        lc = 1
        for x in fp:
            spell_checker.set_text(x)
            for error in spell_checker:
                print("Error:", error.word, lc)
            # lc counts lines; incremented once per input line (assumed
            # from the original's collapsed indentation — verify).
            lc = lc + 1
def calculate_score(query, doc_id, query_id):
    """Compute a BM25 relevance score for (query, doc_id).

    NOTE(review): `ri` is never assigned (its computing line is commented
    out) and `R`, `N`, `k1`, `k2`, `b`, `avdl` must come from module
    scope.  As written, each term's scoring raises NameError on `ri`,
    which the broad `except` below turns into a printed traceback and a
    zero contribution — so this function currently returns 0 for every
    document.  Confirm whether the relevance-feedback lines were meant to
    be re-enabled.
    """
    spell_dict = DictWithPWL("en_US")
    spell_checker = SpellChecker(spell_dict)
    terms_in_query = query.split()
    bm25_score = 0
    relevance_docIds = relevance_doc_query(query_id)
    # R = len(relevance_docIds)  # Total number of relevant documents for query.
    for term_in_query in terms_in_query:
        try:
            # Enabling Spell checker to find typos in the query
            spell_checker.set_text(term_in_query)
            spell_list = []
            for error in spell_checker:
                spell_list = error.suggest(error.word)
            # Means a typo has been detected
            if len(spell_list) != 0:
                # Replace the term with the first suggestion present in
                # the index.
                for word in spell_list:
                    if word in dict_term_unigram_df:
                        term_in_query = word
                        break
            # Document-length normalisation factor.
            dl = dict_unigram_terms[doc_id]
            K = k1 * ((1-b) + (b * (dl/avdl)))
            # ri = relevance_doc_term(term_in_query, dict_term_unigram_df, relevance_docIds)
            # ni: document frequency, stored after the last ':' in the
            # posting string.
            if term_in_query in dict_term_unigram_df:
                num = dict_term_unigram_df[term_in_query].split(":")
            else:
                num = "0"
            ni = int(num[-1])
            # fi: in-document term frequency, parsed from get_fi()'s
            # string form (trailing ')' stripped).
            str_fi = get_fi(term_in_query, doc_id)
            if (isinstance(str_fi, str)):
                fi = float(str_fi.strip(")"))
            else:
                fi = 0
            qfi = terms_in_query.count(term_in_query)
            # Classic BM25 with relevance information (Robertson/Sparck
            # Jones weight x tf saturation x query-tf saturation).
            exp1 = (((float(ri) + 0.5) / (float(R) - float(ri) + 0.5)) / ((float(ni) - float(ri) + 0.5) / (float(N) - float(ni) - float(R) + float(ri) + 0.5)))
            exp2 = math.log(exp1)
            exp3 = (((float(k1) + 1) * float(fi)) / (float(K) + float(fi)))
            exp4 = (((float(k2) + 1) * float(qfi)) / (float(k2) + float(qfi)))
            temp_score = exp2*exp3*exp4
            bm25_score += temp_score
        except Exception as e:
            print(traceback.format_exc())
            pass
    return bm25_score
def readAmount(imgPath, preprocess):
    """OCR an image and return the spell-corrected candidate line(s),
    with any word containing '*' (unreadable glyphs) blanked out.

    Args:
        imgPath: path of the image to read.
        preprocess: 'thresh' for Otsu binarisation, 'blur' for median
            blurring before OCR.
    """
    #print(os.path.join(root, name))
    image = cv2.imread(imgPath)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    #Removing some noise
    # NOTE(review): dilate/erode run on `image` AFTER `gray` was captured,
    # so this denoising never reaches the OCR input — confirm intent.
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.dilate(image, kernel, iterations=1)
    image = cv2.erode(image, kernel, iterations=1)
    if preprocess == "thresh":
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    #make a check to see if median blurring should be done to remove
    #noise
    elif preprocess == "blur":
        gray = cv2.medianBlur(gray, 3)
    # write the grayscale image to disk as a temporary file so we can
    # apply OCR to it
    filename = "{}.png".format(os.getpid())
    cv2.imwrite(filename, gray)
    # load the image, apply OCR, and then delete
    # the temporary file
    Spellchecked = ''
    result = pytesseract.image_to_string(Image.open(filename))
    lines = result.split('\n')
    # matches() is an external helper selecting lines that look like the
    # sought amount — see its definition for the criteria.
    probableLines = matches(lines)
    #Spell check and auto-correct the extracted line
    if len(probableLines) > 0:
        from enchant.checker import SpellChecker
        # en_US plus num.txt, a personal word list of amount tokens.
        chkr = SpellChecker(DictWithPWL("en_US", "num.txt"))
        chkr.set_text(probableLines)
        for err in chkr:
            sug = err.suggest()
            if len(sug) > 0:
                # Auto-correct with the top suggestion.
                err.replace(sug[0])
        Spellchecked = chkr.get_text()
        words = Spellchecked.split(' ')
        #remove any unreadable characters
        star = '*'
        # NOTE(review): `break` stops after the first starred word; later
        # starred words are left in place — confirm that is intended.
        for word in words:
            if star in word:
                Spellchecked = Spellchecked.replace(word, ' ')
                break
    os.remove(filename)
    return (Spellchecked)
def calculate_BM25(query, query_id, query_enrichment, result_folder_path,
                   file):
    """Rank documents for *query* with BM25 (after spell-correcting the
    query terms) and append the top 100 results to *file* in TREC format.

    Mirrors query_likelihood_35(); candidate documents come from each
    term's posting string in dict_term_unigram_df.
    """
    spell_checker = SpellChecker(DictWithPWL("en_US"))
    BM25_dict = {}
    terms_in_query = query.split()
    doc_list = []
    queryStr = ""
    for term in terms_in_query:
        queryStr += term + " "
        # Enabling Spell checker to find typos in the query
        spell_checker.set_text(term)
        spell_list = []
        for error in spell_checker:
            spell_list = error.suggest(error.word)
        # Means a typo has been detected
        if len(spell_list) != 0:
            for word in spell_list:
                if word in dict_term_unigram_df:
                    term = word
                    break
        if term in dict_term_unigram_df.keys():
            str1 = dict_term_unigram_df[term]
            str2 = str1.split(",")
            # Fix: the slice was `str2[:3-1]`, i.e. only the first two
            # postings per term.  The sibling query_likelihood_35() uses
            # `str2[:-1]` to drop the empty element after the trailing
            # comma, which is clearly what was intended here as well.
            for x in str2[:-1]:
                if x.strip() not in doc_list:
                    doc_list.append(x.strip())
    for x in doc_list:
        BM25_dict[x] = calculate_score(query, x, query_id)
    sorted_dict = sorted(BM25_dict.items(), key=operator.itemgetter(1))
    ranked_data = sorted_dict[::-1][0:100]
    banner = ("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^"
              "^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + "\n")
    file.write(str(banner))
    file.write("Query : " + str(queryStr) + "\n \n")
    rank = 0
    for key, value in ranked_data:
        rank += 1
        temp_str = (str(query_id) + " " + "Q0" + " " + " " + str(key) + " "
                    + str(rank) + " " + str(value) + " " + "BM25" + "\n")
        file.write(temp_str + "\n")
    file.write(str(banner))
def test_DWPWL(tmp_path, pwl_path):
    """Test functionality of DictWithPWL."""
    setPWLContents(pwl_path, ["Sazz", "Lozz"])
    exclude_path = tmp_path / "pel.txt"
    dwp = DictWithPWL("en_US", str(pwl_path), str(exclude_path))
    # Both personal and base-dictionary words are accepted.
    assert dwp.check("Sazz")
    assert dwp.check("Lozz")
    assert dwp.check("hello")
    assert not dwp.check("helo")
    assert not dwp.check("Flagen")
    # add() persists to the PWL file and feeds suggestions.
    dwp.add("Flagen")
    assert dwp.check("Flagen")
    assert "Flagen" in getPWLContents(pwl_path)
    assert "Flagen" in dwp.suggest("Flagn")
    assert "hello" in dwp.suggest("helo")
    # remove() excludes words — even base-dictionary ones — from both
    # checking and suggesting.
    dwp.remove("hello")
    assert not dwp.check("hello")
    assert "hello" not in dwp.suggest("helo")
    dwp.remove("Lozz")
    assert not dwp.check("Lozz")
def __init__(self, path, wl_dir, chunkers, filters):
    """Load the PO file at *path* and build its language-specific
    SpellChecker (base language is tried when the exact locale has no
    dictionary; DictNotFoundError is raised when neither resolves)."""
    self.popath = path
    self.po = polib.pofile(path)
    self.lang = self.po.metadata["Language"]
    available_lang = Broker().list_languages()
    if self.lang not in available_lang:
        # Fall back from e.g. 'de_AT' to 'de' when possible.
        baselang = self.lang.split("_")[0]
        if baselang not in available_lang:
            print("Dictionary for language '%s' could not be found."
                  % self.lang)
            raise(errors.DictNotFoundError)
        self.lang = baselang
    wordlist = Check.get_wordlist(self.lang, wl_dir, path)
    try:
        check_dict = DictWithPWL(self.lang, pwl=wordlist)
    except errors.Error as e:
        # Fall back to the bare dictionary when the PWL cannot be used.
        check_dict = Dict(self.lang)
        print(e)
    self.checker = SpellChecker(check_dict, chunkers=chunkers,
                                filters=filters)
def static_analysis(self, path):
    """
    Perform static analysis of the notebook.
    Read the notebook and check that there is no output and that the links
    in the markdown cells are not broken.
    Args:
        path (string): Name of notebook.
    Return:
        boolean: True if static analysis succeeded, otherwise False.
    """
    nb = nbformat.read(path, nbformat.current_nbformat)

    #######################
    # Check that the notebook does not contain output from code cells
    # (should not be in the repository, but well...).
    #######################
    no_unexpected_output = True

    # Check that the cell dictionary has an 'outputs' key and that it is
    # empty, relies on Python using short circuit evaluation so that we
    # don't get KeyError when retrieving the 'outputs' entry.
    cells_with_output = [
        c.source for c in nb.cells if 'outputs' in c and c.outputs
    ]
    if cells_with_output:
        no_unexpected_output = False
        print(
            'Cells with unexpected output:\n_____________________________')
        for cell in cells_with_output:
            print(cell + '\n---')
    else:
        print('no unexpected output')

    #######################
    # Check that all the links in the markdown cells are valid/accessible.
    #######################
    no_broken_links = True
    cells_and_broken_links = []
    for c in nb.cells:
        if c.cell_type == 'markdown':
            html_tree = document_fromstring(markdown.markdown(c.source))
            broken_links = []
            #iterlinks() returns tuples of the form (element, attribute, link, pos)
            for document_link in html_tree.iterlinks():
                try:
                    if 'http' not in document_link[2]:
                        # Local file.
                        url = 'file://' + os.path.abspath(document_link[2])
                    else:
                        # Remote file.
                        url = document_link[2]
                    urlopen(url)
                except URLError:
                    broken_links.append(url)
            if broken_links:
                cells_and_broken_links.append((broken_links, c.source))
    if cells_and_broken_links:
        no_broken_links = False
        print('Cells with broken links:\n________________________')
        for links, cell in cells_and_broken_links:
            print(cell + '\n')
            print('\tBroken links:')
            print('\t' + '\n\t'.join(links) + '\n---')
    else:
        print('no broken links')

    #######################
    # Spell check all markdown cells and comments in code cells using the pyenchant spell checker.
    #######################
    no_spelling_mistakes = True
    simpleitk_notebooks_dictionary = DictWithPWL(
        'en_US',
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'additional_dictionary.txt'))
    spell_checker = SpellChecker(simpleitk_notebooks_dictionary,
                                 filters=[EmailFilter, URLFilter])
    cells_and_spelling_mistakes = []
    for c in nb.cells:
        spelling_mistakes = []
        if c.cell_type == 'markdown':
            # Get the text as a string from the html without the markup which is replaced by space.
            spell_checker.set_text(' '.join(
                etree.XPath('//text()')(document_fromstring(
                    markdown.markdown(c.source)))))
        elif c.cell_type == 'code':
            # Get all the comments and concatenate them into a single string separated by newlines.
            comment_lines = re.findall('#+.*', c.source)
            spell_checker.set_text('\n'.join(comment_lines))
        for error in spell_checker:
            # NOTE(review): suggest() is called without an argument —
            # presumably it suggests for the checker's current error word;
            # confirm against the pyenchant SpellChecker API.
            error_message = 'error: ' + '\'' + error.word + '\', ' + 'suggestions: ' + str(
                spell_checker.suggest())
            spelling_mistakes.append(error_message)
        if spelling_mistakes:
            cells_and_spelling_mistakes.append(
                (spelling_mistakes, c.source))
    if cells_and_spelling_mistakes:
        no_spelling_mistakes = False
        print('Cells with spelling mistakes:\n________________________')
        for misspelled_words, cell in cells_and_spelling_mistakes:
            print(cell + '\n')
            print('\tMisspelled words and suggestions:')
            print('\t' + '\n\t'.join(misspelled_words) + '\n---')
    else:
        print('no spelling mistakes')

    return (no_unexpected_output and no_broken_links
            and no_spelling_mistakes)
# Python 2 script: spell-check test_copy.txt against en_US plus the
# personal word list myDict.txt, printing each misspelled word and a
# final error count.
from enchant import DictWithPWL
from enchant.checker import SpellChecker

my_dict = DictWithPWL("en_US", "myDict.txt")
my_checker = SpellChecker(my_dict)
with open('test_copy.txt', 'r') as f:
    f_contents = f.read().decode("utf-8-sig").encode(
        "utf-8")  #decode the contents to unicode and encode to utf-8
my_checker.set_text(f_contents)
e = 0
for error in my_checker:
    print "ERROR:", error.word
    e = e + 1
print('No. of errors: ', e)
# NOTE(review): everything below is a commented-out wx/CmdLine experiment
# kept inside a triple-quoted string; the closing quotes do not appear in
# this excerpt — confirm the string is terminated later in the file.
'''
import enchant
import wx
from enchant.checker import SpellChecker
from enchant.checker.wxSpellCheckerDialog import wxSpellCheckerDialog
from enchant.checker.CmdLineChecker import CmdLineChecker
a = "Cats are animalss. " \
    "They are violenttt."
chkr = enchant.checker.SpellChecker("en_US")
chkr.set_text(a)
for err in chkr:
    print err.word
    sug = err.suggest()[0]
    err.replace(sug)
def main():
    """Potypo entry point: spell-check every .po file under locales_dir
    against its own language dictionary plus per-language word lists, and
    exit with status 1 when failing errors were found."""
    config = configparser.ConfigParser()
    config.read('setup.cfg')
    conf = config['potypo']

    # Resolve chunker names from config: dotted names are imported,
    # bare names are looked up on the bundled `chunkers` module.
    chunker_list = []
    for chunker in conf['chunkers'].strip().split(","):
        if "." in chunker:
            components = chunker.rsplit('.', 1)
            mod = __import__(components[0], fromlist=[components[1]])
            class_object = getattr(mod, components[1])
        else:
            class_object = getattr(chunkers, chunker)
        chunker_list.append(class_object)

    # Same resolution scheme for filters.
    filter_list = []
    for f in conf['filters'].strip().split(","):
        if "." in f:
            components = f.rsplit('.', 1)
            mod = __import__(components[0], fromlist=[components[1]])
            class_object = getattr(mod, components[1])
        else:
            class_object = getattr(filters, f)
        filter_list.append(class_object)

    # Optional config-driven chunker/filter extensions.
    if 'phrases' in conf:
        phrases = conf['phrases'].strip().split('\n')
        chunker_list.append(chunkers.make_PhraseChunker(phrases))
    if 'edgecase_words' in conf:
        words = conf['edgecase_words'].strip().split('\n')
        filter_list.append(filters.make_EdgecaseFilter(words))

    def errmsg(path, linenum, word):
        # One line per misspelling: relative path, line number, word.
        print("ERROR: {}:{}: {}".format(path, linenum, word))

    # checks contains one Check-Object for every po-file
    checks = []
    for root, dirs, files in os.walk(conf['locales_dir']):
        for f in files:
            if f.endswith(".po"):
                try:
                    checks.append(
                        Check(os.path.join(root, f), conf['wl_dir'],
                              chunker_list, filter_list))
                except errors.DictNotFoundError as err:
                    print(
                        err,
                        "Potypo will not check for spelling errors in this language."
                    )

    # Separate checker for the untranslated msgids (default language).
    en_wordlist = Check.get_wordlist(conf['default_language'],
                                     conf['wl_dir'], conf['locales_dir'])
    en_dict = DictWithPWL(conf['default_language'], pwl=en_wordlist)
    en_ckr = SpellChecker(en_dict, chunkers=chunker_list,
                          filters=filter_list)

    fail = False  # used for tracking whether failing errors occurred
    for c in checks:
        print("Checking Errors in file", c.popath, "for lang", c.lang)
        for entry in c.po:
            if entry.obsolete:
                continue
            # msgid misspellings always count as failures.
            en_ckr.set_text(entry.msgid)
            for err in en_ckr:
                fail = True
                path = os.path.relpath(
                    c.popath, start=config['potypo']['locales_dir'])
                errmsg(path, entry.linenum, err.word)
            # msgstr misspellings are reported for every language but only
            # fail the run for languages not listed in no_fail.
            c.checker.set_text(entry.msgstr)
            for err in c.checker:
                if c.lang not in conf['no_fail']:
                    fail = True
                path = os.path.relpath(
                    c.popath, start=config['potypo']['locales_dir'])
                errmsg(path, entry.linenum, err.word)

    print("Spell-checking done.")
    if fail:
        sys.exit(1)
    sys.exit(0)
lines = f.read().splitlines() # You better not have more than 1 word in a line for wrd in lines: if not enchant_dict.check(wrd): enchant_dict.add_to_pwl(wrd) if __name__ == '__main__': args = parse_args() # print(args) thisdir = os.path.dirname(os.path.abspath(__file__)) sitk_dict = DictWithPWL('en_US', thisdir + '/additional_dictionary.txt') if args.dict is not None: for d in args.dict: add_dict(sitk_dict, d) spell_checker = SpellChecker(sitk_dict, filters=[EmailFilter, URLFilter]) output_lvl = 1 if args.brief: output_lvl = 0 else: if args.verbose: output_lvl = 2 if args.miss: output_lvl = -1
from enchant import DictWithPWL

my_dict = DictWithPWL("pt_BR")
# NOTE(review): enchant Dict/DictWithPWL objects are not documented as
# iterable; this loop most likely raises TypeError at runtime — confirm
# the intent (perhaps the personal word list file was meant to be read
# and printed instead).
for x in my_dict:
    print(x)
# -*- encoding: utf-8 -*- import freeling import os from enchant import DictWithPWL from enchant.checker import SpellChecker from difflib import get_close_matches, SequenceMatcher DATA = "/usr/local/share/freeling/" LANG = "es" assert os.path.getsize('../utilities/es-lat') > 0 my_dict = DictWithPWL('es', '../utilities/es-lat') assert my_dict.provider.name == 'aspell' chkr = SpellChecker(my_dict) class Analyzer: def __init__(self): freeling.util_init_locale("default") # Create options set for maco analyzer op = freeling.maco_options(LANG) op.PunctuationFile = DATA + "common/punct.dat" op.DictionaryFile = DATA + LANG + "/es-ar/dicc.src" op.AffixFile = DATA + LANG + "/afixos.dat" op.LocutionsFile = DATA + LANG + "/locucions.dat" op.NPdataFile = DATA + LANG + "/np.dat" op.QuantitiesFile = DATA + LANG + "/quantities.dat" op.ProbabilityFile = DATA + LANG + "/probabilitats.dat"
def add_dict(enchant_dict, filename):
    """Add every word of *filename* (one word per line) to the checker's
    personal word list, skipping words the dictionary already accepts."""
    with open(filename) as f:
        lines = f.read().splitlines()
        # You better not have more than 1 word in a line
        for wrd in lines:
            if not enchant_dict.check(wrd):
                enchant_dict.add_to_pwl(wrd)


if __name__ == '__main__':
    args = parse_args()
    # print(args)
    # Base dictionary: en_US augmented with the project word list.  Note
    # the path here is relative to the current working directory.
    sitk_dict = DictWithPWL('en_US', 'additional_dictionary.txt')
    if args.dict is not None:
        # Extra user-supplied word-list files.
        for d in args.dict:
            add_dict(sitk_dict, d)
    spell_checker = SpellChecker(sitk_dict, filters=[EmailFilter, URLFilter])
    # Output verbosity: 0 = brief, 1 = default, 2 = verbose,
    # -1 = presumably misspellings only (args.miss) — verify downstream.
    output_lvl = 1
    if args.brief:
        output_lvl = 0
    else:
        if args.verbose:
            output_lvl = 2
        if args.miss:
            output_lvl = -1