def find_sug_words(evt):
    """Handle selection of a misspelled word in the word listbox.

    Looks up spelling suggestions for the selected word and shows them
    in the suggestion listbox.

    Args:
        evt: Tkinter event object (required by the binding; unused).
    """
    # Word currently highlighted in the misspelled-words listbox.
    value = wrd_lstbox.get(ANCHOR)
    print("value", value)
    # Tokenize the text-widget content; raw string avoids the invalid
    # escape warning the original "[^\w]" produced.
    word = Txt_input.get("1.0", END)
    inpt_txt = re.sub(r"[^\w]", " ", word).split()
    # en_US dictionary extended with the project's personal word list.
    # (File name kept verbatim, typo and all — it is a runtime path.)
    d = DictWithPWL("en_US", "Word Dicitionary4.txt")
    chk_status = d.check(value)
    print(chk_status)
    suggst_str = d.suggest(value)
    # Bug fix: the original cleared the suggestion listbox but the loop
    # repopulating it was dead code inside a string literal.
    sugst_lstbx.delete(0, END)
    for sug in suggst_str:
        sugst_lstbx.insert(0, sug)
    print("Listbox pressed")
def setUp(self):
    """
    Verify required modulemd file parameter has been specified, exists,
    and can be loaded. The file name and loaded metadata are saved.
    """
    mdfile = self.params.get('modulemd')
    if mdfile is None:
        self.error("modulemd parameter must be supplied")
    mdfile = str(mdfile)
    if not os.path.isfile(mdfile):
        self.error("modulemd file %s must exist" % mdfile)
    try:
        mmd = modulemd.ModuleMetadata()
        mmd.load(mdfile)
    except Exception as ex:
        self.error(
            "There was an error while processing modulemd file %s: %s"
            % (mdfile, ex))
    # Infer the module name from the mdfile name and check that it is sane
    mdfileModuleName, mdfileExtension = os.path.basename(mdfile).split('.', 1)
    if mdfileExtension not in ('yaml', 'yml'):
        self.error("modulemd file %s must have a .y[a]ml extension" % mdfile)
    if mmd.name == '':
        # The name can be missing from the metadata because the builder
        # knows how to infer it
        mmd.name = mdfileModuleName
    elif mmd.name != mdfileModuleName:
        self.error(
            "modulemd file name %s and module name %s do not match"
            % (mdfileModuleName, mmd.name))
    self.mdfile = mdfile
    self.mmd = mmd
    # Build the spell checker, optionally extended with a personal word
    # list ("jargon file") supplied via test parameters.
    jargonfile = self.params.get('jargonfile')
    try:
        if jargonfile is not None:
            jargonfile = str(jargonfile)
            # Bug fix: do not shadow the builtin `dict`.
            spell_dict = DictWithPWL("en_US", jargonfile)
            # Components of the module name are considered valid words.
            for w in self.mmd.name.split('-'):
                spell_dict.add_to_session(w)
            self.chkr = SpellChecker(spell_dict)
        else:
            self.chkr = SpellChecker("en_US")
    except Exception:
        # Bug fix: the original message interpolated the local `dict`,
        # which is unbound when jargonfile is None (NameError inside
        # the handler). Report the jargon file instead.
        self.error(
            "Could not initialize spell checker with dictionary %s"
            % jargonfile)
def checkAllFiles() : lang = os.environ.get('POOTLE_LANG') spellCheckLang = os.environ.get('SPELLCHECK_LANG') if (os.environ.get('POOTLE_LANG') == None) : print "The POOTLE_LANG variable is not set!" print "Please set it with export POOTLE_LANG=hu before calling this script!" print "The variable should match with the language code on the pootle." return if (spellCheckLang == None) : print "The SPELLCHECK_LANG variable is not set!" print "Please set it with export SPELLCHECK_LANG=hu_HU before calling this script!" print "The variable should match with the language code on the pootle." return pwl = DictWithPWL(spellCheckLang, "known_words_database/" +lang+ ".txt") chkr = SpellChecker(pwl) with open("tools/packagelist.txt") as f: fileList = f.readlines() fileList = [x.strip() for x in fileList] for filename in fileList : checkFile("translations/"+lang+"/" + filename, chkr)
class Bot:
    '''Klein-based HTTP bot: spell-corrects the query text, runs it
    through the NLU interpreter, and returns intent/entities as JSON.'''
    app = Klein()
    # Shared dictionary/checker: "words.txt" is a personal word list
    # layered on top of the en_US dictionary.
    my_dict = DictWithPWL("en_US", "words.txt")
    chkr = SpellChecker(my_dict)

    def __init__(self):
        self.test = 'hi'

    @app.route('/parse', methods=['GET'])
    def parse(self, request):
        '''Parse the "q" query parameter and return the NLU result.

        Args:
            request: Twisted/Klein request object.
        Returns:
            JSON string with "intent" and "entities" keys.
        '''
        request.setHeader('Content-Type', 'application/json')
        # Decode the raw bytes params into a str->str mapping.
        request_params = {
            key.decode('utf-8', 'strict'): value[0].decode('utf-8', 'strict')
            for key, value in request.args.items()
        }
        text = str(request_params['q']).strip()
        # Replace each misspelled word with the best suggestion.
        self.chkr.set_text(text)
        for err in self.chkr:
            err.replace(get_best_word(self.chkr, err.word))
        spell_checked = self.chkr.get_text()
        # NOTE(review): `unicode` implies Python 2 — confirm runtime.
        resp = interpreter.parse(unicode(spell_checked, encoding="utf-8"))
        print(resp)
        # Only trust sufficiently confident intents.
        if (float(resp['intent']['confidence']) > 0.5):
            reply = {"intent": resp['intent'], "entities": resp['entities']}
        else:
            reply = {"intent": {"name": "None"}, "entities": ""}
        return json.dumps(dict(reply), indent=4)
def addCustomDict(self, customDictPath):
    """Register an additional personal (custom) dictionary checker.

    Args:
        customDictPath: path to the personal word-list file.
    """
    # Make sure the synthetic language files for the custom dict exist;
    # creation failure is logged but not fatal.
    try:
        self._createCustomDictLang(self._folders[-1])
    except IOError as err:
        logger.error("Can't create custom dictionary")
    key = (CUSTOM_DICT_LANG, customDictPath)
    if key not in self._dictCache:
        # Point enchant's myspell backend at our dictionary folder.
        broker = Broker()
        broker.set_param('enchant.myspell.dictionary.path',
                         self._folders[-1])
        try:
            currentDict = DictWithPWL(CUSTOM_DICT_LANG,
                                      customDictPath,
                                      broker=broker)
        except enchant.errors.Error as err:
            # NOTE(review): the "lang={}" placeholder is filled with the
            # (lang, path) tuple `key`, not the language — confirm.
            logger.error('Custom dictionary error. path={}; lang={}'.format(customDictPath, key))
            logger.error(err)
            return
        self._dictCache[key] = currentDict
    else:
        currentDict = self._dictCache[key]
    self._customCheckers.append(currentDict)
def addCustomDict(self, customDictPath):
    """Register an additional personal (custom) dictionary checker.

    Best-effort variant: missing language files and unusable
    dictionaries are silently ignored, leaving the checker list
    unchanged.

    Args:
        customDictPath: path to the personal word-list file.
    """
    # Make sure the synthetic language files for the custom dict exist;
    # failure is deliberately ignored here.
    try:
        self._createCustomDictLang(self._folders[-1])
    except IOError:
        pass
    key = (CUSTOM_DICT_LANG, customDictPath)
    if key not in self._dictCache:
        # Point enchant's myspell backend at our dictionary folder.
        broker = Broker()
        broker.set_param('enchant.myspell.dictionary.path',
                         self._folders[-1])
        try:
            currentDict = DictWithPWL(CUSTOM_DICT_LANG,
                                      customDictPath,
                                      broker=broker)
        except enchant.errors.Error:
            # Unusable dictionary: skip registration entirely.
            return
        self._dictCache[key] = currentDict
    else:
        currentDict = self._dictCache[key]
    self._customCheckers.append(currentDict)
def _get_language_checker(self, po_file, reports):
    """Get checker for PO file language."""
    # Returns a single-element list with the SpellChecker, or an empty
    # list when spelling is disabled or the dictionary/PWL fails.
    checker = []
    if self.spelling:
        if not ENCHANT_FOUND:
            raise ImportError('Enchant module not found (please install '
                              '"pyenchant")')
        # 'str' mode checks translations in the PO language; otherwise
        # msgids are checked against English.
        lang = po_file.props['language'] \
            if self.spelling == 'str' else 'en'
        try:
            # Write the personal word list to a temp file so enchant can
            # load it by name.
            # NOTE(review): reopening a NamedTemporaryFile by name while
            # still open does not work on Windows — confirm platforms.
            with tempfile.NamedTemporaryFile() as tmp_file:
                tmp_file.write(self.pwl.encode('utf-8'))
                tmp_file.flush()
                _dict = DictWithPWL(lang, tmp_file.name)
                checker.append(SpellChecker(_dict))
        except DictNotFoundError:
            # No dictionary installed for this language: report and
            # disable spell checking for the file.
            reports.append(
                PoReport(
                    'enchant dictionary not found for language "{0}"'
                    ''.format(lang),
                    'dict', po_file.filename,
                    po_file.props['language_numline']))
            checker = []
        except IOError as exc:
            # Personal word list could not be written/read.
            reports.append(
                PoReport(str(exc), 'pwl', po_file.filename,
                         po_file.props['language_numline']))
            checker = []
    return checker
def spellCheckHelper(self, row):
    """Count spelling errors in the row's essay text.

    Args:
        row: mapping with an 'essay' string entry.
    Returns:
        Number of misspelled words found by the checker.
    """
    # en_US dictionary extended with a personal word list.
    my_dict = DictWithPWL("en_US", "morewords.txt")
    my_checker = SpellChecker(my_dict)
    my_checker.set_text(row['essay'])
    # Fix: removed the unused `count` local and the unused nltk
    # RegexpTokenizer (a dead third-party dependency).
    return len(my_checker)
def spelling(text): my_dict = DictWithPWL("en_US", "myDict.txt") my_checker = SpellChecker(my_dict) my_checker.set_text(text) e = 0 print ' Spelling errors: ' for error in my_checker: print " ", error.word e = e + 1 return e
def initialise(self, sitecheck):
    """(Re)create the spell checker; also runs when a check is resumed.

    Dictionary preference: a site-local dict.txt under the session root
    wins over the module's bundled dict.txt; otherwise the plain
    language dictionary is used.
    """
    super(Spelling, self).initialise(sitecheck)
    # Spell checker must be re-created when check is resumed
    global _enchant_available
    if _enchant_available:
        # Bug fix: the original concatenated directory + 'dict.txt'
        # without a separator, yielding e.g. ".../modulesdict.txt" so
        # the bundled dictionary was never found.
        ddp = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                           'dict.txt')
        cdp = os.path.join(self.sitecheck.session.root_path, 'dict.txt')
        if os.path.exists(cdp):
            self.dictionary = cdp
            d = DictWithPWL(self.language, cdp)
        elif os.path.exists(ddp):
            self.dictionary = ddp
            d = DictWithPWL(self.language, ddp)
        else:
            d = Dict(self.language)
        self.spell_checker = SpellChecker(d, filters=[EmailFilter, URLFilter])
def get_spelling_error_count(essay):
    """Return (error_count, unique_misspelled_words) for *essay*."""
    checker = SpellChecker(DictWithPWL("en_US", "morewords.txt"))
    checker.set_text(essay)
    # One entry per error occurrence; duplicates collapse in the set.
    misspelled = [mistake.word for mistake in checker]
    return len(misspelled), list(set(misspelled))
def query_likelihood_35(query, query_id):
    """Rank documents for *query* with the query-likelihood model
    (lambda = 0.35) and append the top-100 results to the output file.

    Args:
        query: raw query string.
        query_id: identifier written into the TREC-style output rows.
    """
    spell_dict = DictWithPWL("en_US")
    spell_checker = SpellChecker(spell_dict)
    rank = 0
    QLM_35_dict = {}
    terms_in_query = query.split()
    doc_list = []
    queryStr = ""
    for term in terms_in_query:
        queryStr += term + " "
        # Enabling Spell checker to find typos in the query
        spell_checker.set_text(term)
        spell_list = []
        for error in spell_checker:
            spell_list = error.suggest(error.word)
        # Means a typo has been detected: take the first suggestion
        # that is actually an indexed term.
        if len(spell_list) != 0:
            for word in spell_list:
                if word in dict_term_unigram_df:
                    term = word
                    break
        # Collect every document containing the (corrected) term;
        # [:-1] drops the trailing empty chunk from the comma split.
        if term in dict_term_unigram_df.keys():
            str1 = dict_term_unigram_df[term]
            str2 = str1.split(",")
            for x in str2[:-1]:
                if x.strip() not in doc_list:
                    doc_list.append(x.strip())
    # Score each candidate document against the full query.
    for x in doc_list:
        QLM_35 = calculate_score(query, x, query_id)
        QLM_35_dict.update({x: QLM_35})
    # Sort ascending by score, then reverse for the 100 best.
    sorted_dict = sorted(QLM_35_dict.items(), key=operator.itemgetter(1))
    ranked_data = sorted_dict[::-1][0:100]
    # NOTE(review): `file` shadows the builtin and is not closed on
    # error — consider a `with` block.
    file = open("Query_Likelihood_Model_0.35_Ranking_with_spell_checking.txt", 'a')
    file.write(
        str("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + "\n"))
    file.write("Query : " + str(queryStr) + "\n \n")
    for key, value in ranked_data:
        rank += 1
        temp_str = str(query_id) + " " + "Q0" + " " + " " + str(
            key) + " " + str(rank) + " " + str(value) + " " + "QLM35" + "\n"
        file.write(temp_str + "\n")
    file.write(
        str("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + "\n"))
    file.close()
def test_pwl(self): """Test checker loop with PWL.""" from enchant import DictWithPWL d = DictWithPWL("en_US", None, None) txt = "I am sme text to be cheked with personal list of cheked words" chkr = SpellChecker(d, txt) for n, err in enumerate(chkr): if n == 0: self.assertEqual(err.word, "sme") if n == 1: self.assertEqual(err.word, "cheked") chkr.add() self.assertEqual(n, 1)
def myspell(fname):
    """Spell-check *fname* line by line, printing each error with its
    line number.

    Args:
        fname: path of the text file to check.
    """
    my_dict = DictWithPWL('en_US', 'mywords.txt')
    print(my_dict)
    spell_checker = SpellChecker(my_dict, filters=[EmailFilter, URLFilter])
    # Bug fix: use a context manager so the file handle is always
    # closed (the original leaked it), and enumerate so the counter
    # tracks lines rather than being bumped inside the error loop.
    with open(fname, 'r') as fp:
        for lc, line in enumerate(fp, start=1):
            spell_checker.set_text(line)
            for error in spell_checker:
                print("Error:", error.word, lc)
def __init__(self, mydict=None, lang='it_IT'):
    """[str] [,str]
    Take the optional list of custom words to merge into the dictionary
    and the language to apply (default: Italian).

    Raises SpellCheckError if the dictionary for `lang` is not
    installed or `mydict` is not accessible.
    """
    self._lang = lang
    self._custom_dict = mydict
    try:
        self._chkr = SpellChecker(lang, filters=[EmailFilter, URLFilter])
        # Personal word list is optional.
        self._pwl = DictWithPWL(lang, mydict) if mydict else None
    except enchant.errors.DictNotFoundError as nodict_err:
        # Fix: chain the original exception instead of discarding the
        # captured (and previously unused) `nodict_err`.
        raise SpellCheckError(
            "Dizionario " + lang + " non trovato") from nodict_err
def test_pwl():
    """Checker iteration honours a personal word list (PWL)."""
    from enchant import DictWithPWL
    transient = DictWithPWL("en_US", None, None)
    sample = "I am sme text to be cheked with personal list of cheked words"
    checker = SpellChecker(transient, sample)
    for position, mistake in enumerate(checker):
        if position == 0:
            assert mistake.word == "sme"
        if position == 1:
            assert mistake.word == "cheked"
            # Adding the word suppresses its later duplicate.
            checker.add()
    # Exactly two errors were reported (indices 0 and 1).
    assert position == 1
def calculate_score(query, doc_id, query_id):
    """Compute the BM25 relevance score of *doc_id* for *query*.

    Args:
        query: raw query string.
        doc_id: identifier of the candidate document.
        query_id: query identifier, used to fetch relevance judgements.
    Returns:
        The accumulated BM25 score over all query terms.
    """
    spell_dict = DictWithPWL("en_US")
    spell_checker = SpellChecker(spell_dict)
    terms_in_query = query.split()
    bm25_score = 0
    relevance_docIds = relevance_doc_query(query_id)
    # Bug fix: R (and ri below) were commented out but still referenced
    # in exp1, so every term raised NameError — swallowed by the broad
    # except — and the function always returned 0.
    R = len(relevance_docIds)  # number of relevant documents for query
    for term_in_query in terms_in_query:
        try:
            # Spell-correct the term to a known index term if possible.
            spell_checker.set_text(term_in_query)
            spell_list = []
            for error in spell_checker:
                spell_list = error.suggest(error.word)
            # A typo has been detected: use the first suggestion that
            # exists in the index.
            if len(spell_list) != 0:
                for word in spell_list:
                    if word in dict_term_unigram_df:
                        term_in_query = word
                        break
            dl = dict_unigram_terms[doc_id]  # document length
            K = k1 * ((1 - b) + (b * (dl / avdl)))
            # Bug fix (see above): restore the ri computation.
            ri = relevance_doc_term(term_in_query, dict_term_unigram_df,
                                    relevance_docIds)
            if term_in_query in dict_term_unigram_df:
                num = dict_term_unigram_df[term_in_query].split(":")
            else:
                num = "0"
            ni = int(num[-1])  # document frequency of the term
            str_fi = get_fi(term_in_query, doc_id)
            if isinstance(str_fi, str):
                fi = float(str_fi.strip(")"))
            else:
                fi = 0
            qfi = terms_in_query.count(term_in_query)
            exp1 = (((float(ri) + 0.5) / (float(R) - float(ri) + 0.5)) /
                    ((float(ni) - float(ri) + 0.5) /
                     (float(N) - float(ni) - float(R) + float(ri) + 0.5)))
            exp2 = math.log(exp1)
            exp3 = (((float(k1) + 1) * float(fi)) / (float(K) + float(fi)))
            exp4 = (((float(k2) + 1) * float(qfi)) / (float(k2) + float(qfi)))
            bm25_score += exp2 * exp3 * exp4
        except Exception:
            print(traceback.format_exc())
    return bm25_score
def readAmount(imgPath, preprocess):
    """OCR an image and return the spell-corrected line that looks like
    an amount.

    Args:
        imgPath: path of the image to read.
        preprocess: "thresh" for Otsu binarization, "blur" for a median
            blur; any other value leaves the grayscale image untouched.
    Returns:
        The corrected candidate text, or '' when no line matched.
    """
    image = cv2.imread(imgPath)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Removing some noise
    kernel = np.ones((1, 1), np.uint8)
    image = cv2.dilate(image, kernel, iterations=1)
    image = cv2.erode(image, kernel, iterations=1)
    if preprocess == "thresh":
        gray = cv2.threshold(gray, 0, 255,
                             cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
    # make a check to see if median blurring should be done to remove
    # noise
    elif preprocess == "blur":
        gray = cv2.medianBlur(gray, 3)
    # write the grayscale image to disk as a temporary file so we can
    # apply OCR to it
    filename = "{}.png".format(os.getpid())
    cv2.imwrite(filename, gray)
    # load the image, apply OCR, and then delete
    # the temporary file
    Spellchecked = ''
    result = pytesseract.image_to_string(Image.open(filename))
    lines = result.split('\n')
    # matches() selects the OCR lines that look like amount lines.
    probableLines = matches(lines)
    # Spell check and auto-correct the extracted line
    if len(probableLines) > 0:
        from enchant.checker import SpellChecker
        # "num.txt" is a personal word list of expected tokens.
        chkr = SpellChecker(DictWithPWL("en_US", "num.txt"))
        chkr.set_text(probableLines)
        for err in chkr:
            sug = err.suggest()
            if len(sug) > 0:
                err.replace(sug[0])
        Spellchecked = chkr.get_text()
        words = Spellchecked.split(' ')
        # remove any unreadable characters
        star = '*'
        # NOTE(review): only the FIRST '*'-containing word is blanked
        # because of the break — confirm this is intended.
        for word in words:
            if star in word:
                Spellchecked = Spellchecked.replace(word, ' ')
                break
    os.remove(filename)
    return (Spellchecked)
def __init__(self):
    """Load pickled search-index structures and build the spelling dict.

    Reads key weights, the inverted hash table, page ranks and titles
    from Google_IITB/data, then creates an en_US dictionary extended
    with every indexed key for spell correction.
    """
    data_dir = str(os.getcwd()) + "/Google_IITB/data/"
    # Fix: the first two files were opened without context managers,
    # leaking the handles if unpickling raised; use `with` everywhere.
    with open(data_dir + "keyweights.db", "rb") as fp:
        self.keyweights = pickle.load(fp)
    with open(data_dir + "data.db", "rb") as fp:
        self.hash_table = pickle.load(fp)
    with open(data_dir + "pageranks.db", "rb") as fp:
        self.ranks = pickle.load(fp)
    with open(data_dir + "titles.db", "rb") as fp:
        self.titles = pickle.load(fp)
    self.d = DictWithPWL("en_US", data_dir + "allkeys.txt")
def _init_spell_checker(self):
    """
    Initialize spell checker dictionary.

    Builds an en_US dictionary optionally extended with jargon words
    from a parameter/environment file, the module repo's jargon.txt,
    and the components of the module name.

    Returns:
        A configured SpellChecker instance.
    """
    default_dict = "en_US"
    spell_dict = None
    jargonfile = self.params.get('jargonfile')
    if not jargonfile:
        jargonfile = os.environ.get('JARGONFILE')
    if jargonfile is not None:
        try:
            jargonfile = str(jargonfile)
            spell_dict = DictWithPWL(default_dict, jargonfile)
        except Exception:  # narrowed from a bare except
            self.error(
                "Could not initialize dictionary using %s file"
                % jargonfile)
    if not spell_dict:
        try:
            spell_dict = DictWithPWL(default_dict)
        except Exception:  # narrowed from a bare except
            self.error(
                "Could not initialize spell checker with dictionary %s"
                % default_dict)
    # Check if there is jargonfile on module repo
    url = ("https://src.fedoraproject.org/cgit/modules/%s.git/plain/jargon.txt"
           % self.mmd.name)
    resp = requests.get(url)
    if resp.status_code >= 200 and resp.status_code < 300:
        # Bug fix: resp.content is bytes on Python 3, so splitting it
        # on a str separator raises TypeError; resp.text is decoded.
        for w in resp.text.split("\n"):
            if w != '':
                spell_dict.add_to_session(w)
    # add words from module name as jargon
    for w in self.mmd.name.split('-'):
        spell_dict.add_to_session(w)
    try:
        chkr = SpellChecker(spell_dict)
    except Exception:  # narrowed from a bare except
        self.error("Could not initialize spell checker")
    return chkr
def calculate_BM25(query, query_id, query_enrichment, result_folder_path, file):
    """Rank documents for *query* with BM25 and write the top 100 to *file*.

    Args:
        query: raw query string.
        query_id: identifier written into the TREC-style output rows.
        query_enrichment: unused here; kept for interface compatibility.
        result_folder_path: unused here; kept for interface compatibility.
        file: open, writable file object receiving the ranking.
    """
    spell_dict = DictWithPWL("en_US")
    spell_checker = SpellChecker(spell_dict)
    rank = 0
    BM25_dict = {}
    terms_in_query = query.split()
    doc_list = []
    queryStr = ""
    for term in terms_in_query:
        queryStr += term + " "
        # Enabling Spell checker to find typos in the query
        spell_checker.set_text(term)
        spell_list = []
        for error in spell_checker:
            spell_list = error.suggest(error.word)
        # A typo has been detected: use the first suggestion that is an
        # indexed term.
        if len(spell_list) != 0:
            for word in spell_list:
                if word in dict_term_unigram_df:
                    term = word
                    break
        if term in dict_term_unigram_df.keys():
            str1 = dict_term_unigram_df[term]
            str2 = str1.split(",")
            # Bug fix: the original sliced str2[:3-1] (only the first
            # two postings); the sibling QLM ranker uses str2[:-1] to
            # drop the trailing empty chunk of the comma split.
            for x in str2[:-1]:
                if x.strip() not in doc_list:
                    doc_list.append(x.strip())
    # Score every candidate document against the full query.
    for x in doc_list:
        BM25_score = calculate_score(query, x, query_id)
        BM25_dict.update({x: BM25_score})
    # Sort ascending by score, then reverse for the 100 best.
    sorted_dict = sorted(BM25_dict.items(), key=operator.itemgetter(1))
    ranked_data = sorted_dict[::-1][0:100]
    file.write(str("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + "\n"))
    file.write("Query : " + str(queryStr) + "\n \n")
    for key, value in ranked_data:
        rank += 1
        temp_str = str(query_id) + " " + "Q0" + " " + " " + str(key) + " " + str(rank) + " " + str(value) + " " + "BM25" + "\n"
        file.write(temp_str + "\n")
    file.write(str("^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^" + "\n"))
def __init__(self, path, wl_dir, chunkers, filters):
    """Load a PO file and build a spell checker for its language.

    Args:
        path: path of the .po file.
        wl_dir: directory containing per-language word lists.
        chunkers: enchant chunker classes for the checker.
        filters: enchant filter classes for the checker.
    Raises:
        errors.DictNotFoundError: no dictionary for the PO language.
    """
    self.popath = path
    self.po = polib.pofile(path)
    self.lang = self.po.metadata["Language"]
    # Fall back from e.g. "de_DE" to "de" when only the base language
    # dictionary is installed.
    available_lang = Broker().list_languages()
    if self.lang not in available_lang:
        baselang = self.lang.split("_")[0]
        if baselang in available_lang:
            self.lang = baselang
        else:
            print("Dictionary for language '%s' could not be found."
                  % self.lang)
            raise(errors.DictNotFoundError)
    # Extend the dictionary with the project word list when possible;
    # fall back to the plain dictionary if the PWL is unusable.
    wordlist = Check.get_wordlist(self.lang, wl_dir, path)
    try:
        check_dict = DictWithPWL(self.lang, pwl=wordlist)
    except errors.Error as e:
        check_dict = Dict(self.lang)
        print(e)
    self.checker = SpellChecker(check_dict, chunkers=chunkers,
                                filters=filters)
# -*- encoding: utf-8 -*- import freeling import os from enchant import DictWithPWL from enchant.checker import SpellChecker from difflib import get_close_matches, SequenceMatcher DATA = "/usr/local/share/freeling/" LANG = "es" assert os.path.getsize('../utilities/es-lat') > 0 my_dict = DictWithPWL('es', '../utilities/es-lat') assert my_dict.provider.name == 'aspell' chkr = SpellChecker(my_dict) class Analyzer: def __init__(self): freeling.util_init_locale("default") # Create options set for maco analyzer op = freeling.maco_options(LANG) op.PunctuationFile = DATA + "common/punct.dat" op.DictionaryFile = DATA + LANG + "/es-ar/dicc.src" op.AffixFile = DATA + LANG + "/afixos.dat" op.LocutionsFile = DATA + LANG + "/locucions.dat" op.NPdataFile = DATA + LANG + "/np.dat" op.QuantitiesFile = DATA + LANG + "/quantities.dat" op.ProbabilityFile = DATA + LANG + "/probabilitats.dat"
class EnchantProxy(object):
    """Wrapper around the enchant library."""

    def __init__(self, mydict=None, lang='it_IT'):
        """[str] [,str]
        Take the optional list of custom words to merge into the
        dictionary and the language to apply (default: Italian).

        Raises SpellCheckError if `mydict` is not accessible or no
        dictionary exists for `lang`.
        """
        self._lang = lang
        self._custom_dict = mydict
        try:
            self._chkr = SpellChecker(lang, filters=[EmailFilter, URLFilter])
            # Personal word list is optional.
            self._pwl = DictWithPWL(lang, mydict) if mydict else None
        except enchant.errors.DictNotFoundError as nodict_err:
            raise SpellCheckError("Dizionario " + lang + " non trovato")

    def check(self, text, chunk_idx):
        """(str, int) -> list of `Error`

        Run the check on `text` and return a list of `Error` objects
        holding the misspelled word and the suggestion list. Words
        found in the personal dictionary (`self._pwl`), when defined,
        are skipped. `chunk_idx` identifies the text chunk being
        processed.
        """
        errors = []
        self._chkr.set_text(text)
        for err in self._chkr:
            # Accept words present in the personal dictionary.
            if self._pwl and self._pwl.check(err.word):
                continue
            error = Error(err.word, self._chkr.suggest(err.word), chunk_idx)
            error.context = text
            errors.append(error)
        return errors

    def upd_mydict(self, word):
        """(str)
        Add `word` to the custom dictionary (effective from the next
        call to `check`).

        **The word is added only to the IN-MEMORY custom dictionary;
        use `add_custom_words` to update the custom dictionary on
        disk.**
        """
        if not self._pwl:
            return
        if self._pwl.is_added(word):
            raise SpellCheckError("Parola già esistente")
        self._pwl.add(word)

    def add_custom_words(self, words):
        """(list of str)
        Append the words in ``words`` to the on-disk custom dictionary.
        """
        if not self._custom_dict:
            raise SpellCheckError("Dizionario personalizzato non presente")
        # Bug fix: the original called .split() directly on the file
        # object returned by codecs.open (AttributeError) and never
        # closed either handle; read the contents first and use
        # context managers.
        with codecs.open(self._custom_dict, encoding='utf-8') as fin:
            orig_words = fin.read().split("\n")
        orig_words.extend([w for w in words if w not in orig_words])
        with codecs.open(self._custom_dict, mode='w',
                         encoding='utf-8') as fout:
            fout.write("\n".join(orig_words))
lines = f.read().splitlines() # You better not have more than 1 word in a line for wrd in lines: if not enchant_dict.check(wrd): enchant_dict.add_to_pwl(wrd) if __name__ == '__main__': args = parse_args() # print(args) thisdir = os.path.dirname(os.path.abspath(__file__)) sitk_dict = DictWithPWL('en_US', thisdir + '/additional_dictionary.txt') if args.dict is not None: for d in args.dict: add_dict(sitk_dict, d) spell_checker = SpellChecker(sitk_dict, filters=[EmailFilter, URLFilter]) output_lvl = 1 if args.brief: output_lvl = 0 else: if args.verbose: output_lvl = 2 if args.miss: output_lvl = -1
from enchant import DictWithPWL from enchant.checker import SpellChecker my_dict = DictWithPWL("en_US", "myDict.txt") my_checker = SpellChecker(my_dict) with open('test_copy.txt', 'r') as f: f_contents = f.read().decode("utf-8-sig").encode( "utf-8") #decode the contents to unicode and encode to utf-8 my_checker.set_text(f_contents) e = 0 for error in my_checker: print "ERROR:", error.word e = e + 1 print('No. of errors: ', e) ''' import enchant import wx from enchant.checker import SpellChecker from enchant.checker.wxSpellCheckerDialog import wxSpellCheckerDialog from enchant.checker.CmdLineChecker import CmdLineChecker a = "Cats are animalss. " \ "They are violenttt." chkr = enchant.checker.SpellChecker("en_US") chkr.set_text(a) for err in chkr: print err.word sug = err.suggest()[0] err.replace(sug)
def main():
    """Spell-check all PO files configured in setup.cfg's [potypo] section.

    Exits with status 1 when failing errors occurred, 0 otherwise.
    """
    config = configparser.ConfigParser()
    config.read('setup.cfg')
    conf = config['potypo']

    # Resolve configured chunkers: dotted paths are imported, bare
    # names are looked up in the bundled chunkers module.
    chunker_list = []
    for chunker in conf['chunkers'].strip().split(","):
        if "." in chunker:
            components = chunker.rsplit('.', 1)
            mod = __import__(components[0], fromlist=[components[1]])
            class_object = getattr(mod, components[1])
        else:
            class_object = getattr(chunkers, chunker)
        chunker_list.append(class_object)

    # Same resolution scheme for filters.
    filter_list = []
    for f in conf['filters'].strip().split(","):
        if "." in f:
            components = f.rsplit('.', 1)
            mod = __import__(components[0], fromlist=[components[1]])
            class_object = getattr(mod, components[1])
        else:
            class_object = getattr(filters, f)
        filter_list.append(class_object)

    # Optional extras built from configuration.
    if 'phrases' in conf:
        phrases = conf['phrases'].strip().split('\n')
        chunker_list.append(chunkers.make_PhraseChunker(phrases))
    if 'edgecase_words' in conf:
        words = conf['edgecase_words'].strip().split('\n')
        filter_list.append(filters.make_EdgecaseFilter(words))

    def errmsg(path, linenum, word):
        print("ERROR: {}:{}: {}".format(path, linenum, word))

    # checks contains one Check-Object for every po-file
    checks = []
    for root, dirs, files in os.walk(conf['locales_dir']):
        for f in files:
            if f.endswith(".po"):
                try:
                    checks.append(
                        Check(os.path.join(root, f), conf['wl_dir'],
                              chunker_list, filter_list))
                except errors.DictNotFoundError as err:
                    print(err,
                          "Potypo will not check for spelling errors "
                          "in this language.")

    # Checker for the msgids, always in the default language.
    en_wordlist = Check.get_wordlist(conf['default_language'],
                                     conf['wl_dir'], conf['locales_dir'])
    en_dict = DictWithPWL(conf['default_language'], pwl=en_wordlist)
    en_ckr = SpellChecker(en_dict, chunkers=chunker_list,
                          filters=filter_list)

    fail = False  # used for tracking whether failing errors occurred
    for c in checks:
        print("Checking Errors in file", c.popath, "for lang", c.lang)
        for entry in c.po:
            if entry.obsolete:
                continue
            en_ckr.set_text(entry.msgid)
            for err in en_ckr:
                fail = True
                # Consistency fix: use the already-bound `conf` section
                # instead of re-indexing config['potypo'].
                path = os.path.relpath(c.popath, start=conf['locales_dir'])
                errmsg(path, entry.linenum, err.word)
            c.checker.set_text(entry.msgstr)
            for err in c.checker:
                # Languages listed in no_fail are reported but do not
                # fail the run.
                if c.lang not in conf['no_fail']:
                    fail = True
                path = os.path.relpath(c.popath, start=conf['locales_dir'])
                errmsg(path, entry.linenum, err.word)

    print("Spell-checking done.")
    if fail:
        sys.exit(1)
    sys.exit(0)
from enchant import DictWithPWL

my_dict = DictWithPWL("pt_BR")
# NOTE(review): Dict/DictWithPWL objects are not iterable, so this loop
# raises TypeError at runtime. Perhaps the intent was to iterate the
# lines of a personal word-list file — confirm.
for x in my_dict:
    print(x)
def test_DWPWL(tmp_path, pwl_path):
    """Test functionality of DictWithPWL."""
    # Seed the personal word list with two made-up words.
    setPWLContents(pwl_path, ["Sazz", "Lozz"])
    other_path = tmp_path / "pel.txt"  # personal exclude list
    d = DictWithPWL("en_US", str(pwl_path), str(other_path))
    # PWL words and normal dictionary words are accepted.
    assert d.check("Sazz")
    assert d.check("Lozz")
    assert d.check("hello")
    # Misspellings and unknown words are rejected.
    assert not d.check("helo")
    assert not d.check("Flagen")
    # add() persists the word to the PWL file and it starts appearing
    # in suggestions.
    d.add("Flagen")
    assert d.check("Flagen")
    assert "Flagen" in getPWLContents(pwl_path)
    assert "Flagen" in d.suggest("Flagn")
    assert "hello" in d.suggest("helo")
    # remove() excludes words — dictionary and PWL words alike — and
    # drops them from suggestions.
    d.remove("hello")
    assert not d.check("hello")
    assert "hello" not in d.suggest("helo")
    d.remove("Lozz")
    assert not d.check("Lozz")
def test_DWPWL_empty(tmp_path):
    """DictWithPWL with no backing files behaves as a transient dict."""
    transient = DictWithPWL("en_US", None, None)
    # Dictionary words pass; misspellings and unknown words fail.
    for word, expected in (("hello", True), ("helo", False),
                           ("Flagen", False)):
        assert transient.check(word) == expected
    # Session additions take effect immediately.
    transient.add("Flagen")
    assert transient.check("Flagen")
    # Removals mask even real dictionary words, and can be undone.
    transient.remove("hello")
    assert not transient.check("hello")
    transient.add("hello")
    assert transient.check("hello")
def static_analysis(self, path):
    """
    Perform static analysis of the notebook.
    Read the notebook and check that there is no ouput and that the
    links in the markdown cells are not broken.
    Args:
    path (string): Name of notebook.
    Return:
    boolean: True if static analysis succeeded, otherwise False.
    """
    nb = nbformat.read(path, nbformat.current_nbformat)

    #######################
    # Check that the notebook does not contain output from code cells
    # (should not be in the repository, but well...).
    #######################
    no_unexpected_output = True

    # Check that the cell dictionary has an 'outputs' key and that it is
    # empty, relies on Python using short circuit evaluation so that we
    # don't get KeyError when retrieving the 'outputs' entry.
    cells_with_output = [
        c.source for c in nb.cells if 'outputs' in c and c.outputs
    ]
    if cells_with_output:
        no_unexpected_output = False
        print(
            'Cells with unexpected output:\n_____________________________')
        for cell in cells_with_output:
            print(cell + '\n---')
    else:
        print('no unexpected output')

    #######################
    # Check that all the links in the markdown cells are valid/accessible.
    #######################
    no_broken_links = True
    cells_and_broken_links = []
    for c in nb.cells:
        if c.cell_type == 'markdown':
            # Render the markdown so we can walk the resulting links.
            html_tree = document_fromstring(markdown.markdown(c.source))
            broken_links = []
            # iterlinks() returns tuples of the form
            # (element, attribute, link, pos)
            for document_link in html_tree.iterlinks():
                try:
                    if 'http' not in document_link[2]:
                        # Local file.
                        url = 'file://' + os.path.abspath(document_link[2])
                    else:
                        # Remote file.
                        url = document_link[2]
                    urlopen(url)
                except URLError:
                    broken_links.append(url)
            if broken_links:
                cells_and_broken_links.append((broken_links, c.source))
    if cells_and_broken_links:
        no_broken_links = False
        print('Cells with broken links:\n________________________')
        for links, cell in cells_and_broken_links:
            print(cell + '\n')
            print('\tBroken links:')
            print('\t' + '\n\t'.join(links) + '\n---')
    else:
        print('no broken links')

    #######################
    # Spell check all markdown cells and comments in code cells using
    # the pyenchant spell checker.
    #######################
    no_spelling_mistakes = True
    # en_US dictionary extended with the repo's additional word list.
    simpleitk_notebooks_dictionary = DictWithPWL(
        'en_US',
        os.path.join(os.path.dirname(os.path.abspath(__file__)),
                     'additional_dictionary.txt'))
    spell_checker = SpellChecker(simpleitk_notebooks_dictionary,
                                 filters=[EmailFilter, URLFilter])
    cells_and_spelling_mistakes = []
    for c in nb.cells:
        spelling_mistakes = []
        if c.cell_type == 'markdown':
            # Get the text as a string from the html without the markup
            # which is replaced by space.
            spell_checker.set_text(' '.join(
                etree.XPath('//text()')(document_fromstring(
                    markdown.markdown(c.source)))))
        elif c.cell_type == 'code':
            # Get all the comments and concatenate them into a single
            # string separated by newlines.
            comment_lines = re.findall('#+.*', c.source)
            spell_checker.set_text('\n'.join(comment_lines))
        for error in spell_checker:
            error_message = 'error: ' + '\'' + error.word + '\', ' + 'suggestions: ' + str(
                spell_checker.suggest())
            spelling_mistakes.append(error_message)
        if spelling_mistakes:
            cells_and_spelling_mistakes.append(
                (spelling_mistakes, c.source))
    if cells_and_spelling_mistakes:
        no_spelling_mistakes = False
        print('Cells with spelling mistakes:\n________________________')
        for misspelled_words, cell in cells_and_spelling_mistakes:
            print(cell + '\n')
            print('\tMisspelled words and suggestions:')
            print('\t' + '\n\t'.join(misspelled_words) + '\n---')
    else:
        print('no spelling mistakes')

    return (no_unexpected_output and no_broken_links
            and no_spelling_mistakes)
class Search:
    # In-memory search engine over a pickled inverted index, with
    # PageRank-based ordering and spelling correction.

    # Class-level defaults; all replaced with loaded data in __init__.
    ranks = {}
    keyweights = {}
    hash_table = {}
    titles = {}
    d = {}

    def __init__(self):
        """Load pickled index structures and build the spelling dict."""
        f = open(str(os.getcwd()) + "/Google_IITB/data/keyweights.db", "rb")
        self.keyweights = pickle.load(f)
        f.close()
        f = open(str(os.getcwd()) + "/Google_IITB/data/data.db", "rb")
        self.hash_table = pickle.load(f)
        f.close()
        with open(str(os.getcwd()) + "/Google_IITB/data/pageranks.db", "rb") as fp:
            self.ranks = pickle.load(fp)
        with open(str(os.getcwd()) + "/Google_IITB/data/titles.db", "rb") as fp:
            self.titles = pickle.load(fp)
        # en_US dictionary extended with every indexed key; used by
        # spellCheck().
        self.d = DictWithPWL("en_US", str(os.getcwd()) + "/Google_IITB/data/allkeys.txt")

    def swap(self, listOfUrls, i, j):
        # NOTE(review): parameter j is ignored; this always swaps
        # positions i and i-1 — confirm intent.
        tmp = listOfUrls[i]
        listOfUrls[i] = listOfUrls[i-1]
        listOfUrls[i-1] = tmp
        return

    def hashFunc(self, key):
        # Simple additive hash over character codes.
        hashout = 0
        for i in range(len(key)):
            hashout = hashout + ord(key[i])
        return hashout

    def findinKeyTable(self, key, Table):
        # Position of `key` inside its hash bucket, or False on miss.
        # NOTE(review): a hit at position 0 is indistinguishable from a
        # miss for callers using `!= False` (0 == False) — confirm.
        hashkey = self.hashFunc(key)
        if hashkey in Table:
            for i in range(len(Table[hashkey])):
                if Table[hashkey][i][1] == key:
                    return i
            return False
        return False

    def spellCheck(self, word):
        # True when `word` is already correct; otherwise the
        # best-weighted suggestion, or False when none is indexed.
        if self.d.check(word) == True:
            return True
        else:
            suggest = self.d.suggest(word)
            for i in range(len(suggest)):
                suggest[i] = suggest[i].lower()
            #keyweights = getKeyWeights()
            bestweight = 0
            bestword = suggest[0]
            for entry in suggest:
                hashkey = self.hashFunc(entry)
                secKey = self.findinKeyTable(entry, self.keyweights)
                if secKey != False:
                    if self.keyweights[hashkey][secKey][0] >= bestweight:
                        bestword = self.keyweights[hashkey][secKey][1]
                        bestweight = self.keyweights[hashkey][secKey][0]
                    # NOTE(review): returns on the FIRST suggestion
                    # found in the key table — confirm intent.
                    return bestword
            return False

    def ngrams(self, word):
        # Prefixes of length 3..len(word) (not true n-grams).
        Ngrams = []
        for i in range(3, len(word)+1):
            Ngrams.append(word[ : i])
        return Ngrams

    def exactQuery(self, entry):
        # Plain whitespace tokenization, no ngram expansion.
        return entry.split()

    def Query(self, entry):
        # Expand each lowercased query word into its prefix "ngrams".
        words = entry.split()
        #words has to be returned somehow
        searchlist = []
        for i in range(len(words)):
            searchlist = searchlist + self.ngrams(words[i].lower())
        #print searchlist
        return searchlist

    def Sort(self, listOfUrls):
        # Single bubble pass ordering URLs by PageRank (descending).
        loc_ranks = []
        for url in listOfUrls:
            hashkey = self.hashFunc(url)
            for i in range(len(self.ranks[hashkey])):
                if self.ranks[hashkey][i][0] == url:
                    loc_ranks.append(self.ranks[hashkey][i][1])
        for i in range(1, len(listOfUrls)):
            if loc_ranks[i] > loc_ranks[i-1]:
                self.swap(listOfUrls, i, i-1)
        return listOfUrls

    def removeRepeats(self, result):
        # Zero-out duplicates, then filter them away.
        # NOTE(review): on Python 3 `filter` returns an iterator, not a
        # list — confirm the target version.
        for i in range(len(result)):
            for j in range(i):
                if result[i] == result[j]:
                    result[i] = 0
        resultFinal = filter(lambda a: a != 0, result)
        return resultFinal

    def primarySort(self, result, matches):
        # Single pass ordering by match count, then de-duplicate.
        for i in range(1, len(result)):
            if matches[i] > matches[i-1]:
                self.swap(result, i, i-1)
        resultSort = self.removeRepeats(result)
        return resultSort

    def findin(self, key, query, table):
        # Position of `query` in bucket `key`, or -1 when absent.
        for i in range(len(table[key])):
            if table[key][i][0] == query:
                return i
        return -1

    def numberOfMatches(self, url, result):
        # Occurrences of `url` within the raw result list.
        Count = 0
        for link in result:
            if url == link:
                Count = Count + 1
        return Count

    def search(self, query):
        # Gather candidate URLs for every prefix-ngram of the query;
        # any ngram with no postings aborts with an empty result.
        result = []
        searchlist = self.Query(query)
        for i in range(len(searchlist)):
            key = self.hashFunc(searchlist[i])
            secKey = self.findin(key, searchlist[i], self.hash_table)
            if secKey == -1:
                return []
            temp = self.hash_table[key][secKey][1 :]
            for j in range(len(temp)):
                #if Find(result, temp[j]) == 0:
                result.append(temp[j])
        matches = []
        #before sorting, first sort according to no. of matches
        for link in result:
            matches.append(self.numberOfMatches(link, result))
        result = self.Sort(result)
        result = self.primarySort(result, matches)
        # Pair each URL with its title (fall back to the URL itself).
        final = [[] for i in range(len(result))]
        for i in range(len(result)):
            try:
                if self.titles[result[i]] == '':
                    final[i] = [result[i], result[i]]
                else:
                    final[i] = [result[i], self.titles[result[i]]]
            except:
                final[i] = [result[i], result[i]]
        return final

    def searchWSC(self, query):
        # Search with spelling correction applied to each query word.
        change = False
        searchlist = query.split()
        for i in range(len(searchlist)):
            bestword = self.spellCheck(searchlist[i])
            # NOTE(review): when spellCheck returns False (no usable
            # suggestion) the word is replaced by False itself —
            # confirm intent.
            if bestword != True:
                searchlist[i] = bestword
                change = True
        changedEntry = ""
        result = self.search(query)
        # Rebuild the corrected query joined with '+' separators.
        if change:
            for i, word in enumerate(searchlist):
                if i is not (len(searchlist)-1):
                    changedEntry = changedEntry + word + "+"
                else:
                    changedEntry = changedEntry + word
        return {'change' : change, 'query' : changedEntry, 'search' : result}
def add_dict(enchant_dict, filename):
    """Add every word listed in *filename* (one per line) to the
    dictionary's personal word list, skipping already-known words."""
    with open(filename) as f:
        lines = f.read().splitlines()
    # You better not have more than 1 word in a line
    for wrd in lines:
        if not enchant_dict.check(wrd):
            enchant_dict.add_to_pwl(wrd)


if __name__ == '__main__':
    args = parse_args()
    # print(args)
    # Base dictionary: en_US plus the bundled additional word list.
    sitk_dict = DictWithPWL('en_US', 'additional_dictionary.txt')
    # Merge in any user-supplied dictionaries.
    if args.dict is not None:
        for d in args.dict:
            add_dict(sitk_dict, d)
    spell_checker = SpellChecker(sitk_dict, filters=[EmailFilter, URLFilter])
    # Output verbosity: 0=brief, 1=default, 2=verbose, -1=misses only.
    output_lvl = 1
    if args.brief:
        output_lvl = 0
    else:
        if args.verbose:
            output_lvl = 2
    if args.miss:
        output_lvl = -1
#coding UTF-8 import sys import re import string import enchant import time import multiprocessing as mp from functools import partial from enchant import DictWithPWL from enchant.checker import * #import python_ginger_api as ginger import language_check import codecs cleaned_text = "" d = DictWithPWL("en_US","dict") #chkr = SpellChecker("en_US") chkr = SpellChecker(d) tool = language_check.LanguageTool('en-US') def eval_sentence(sentence,g="",cs=""): text = "" tot_sp = 0 tot_gm = 0 if g == "-g": if len(sentence) < 200: matches = tool.check(sentence) if len(matches) > 0: with open(cleaned_text+ ".gm_log","a+") as log_f: for match in matches: if str(match.ruleId) != "WHITESPACE_RULE":