def __init__(self, word_list=None, pwl_loc="/data/pwl"):
    self.dict = None
    filepath = os.getcwd() + pwl_loc
    print(filepath)
    # Load the dictionary for spell correction; rebuild the PWL if the
    # sentinel word is missing.
    try:
        print("loading pwl")
        self.dict = request_pwl_dict(filepath)
        if not self.dict.check("sigmaaldrich"):
            print("going to build")
            raise IOError
        print("loaded pwl")
    except IOError:
        print("building pwl")
        with open(filepath, "w+") as f:
            f.write("SAMUELHELMSISAWESOME" + "\n")
        self.load_words(word_list, filepath)
        self.dict = request_pwl_dict(filepath)
def test_UnicodeFN(tmp_path):
    """Test that unicode PWL filenames are accepted."""
    unicode_path = tmp_path / "테스트"
    setPWLContents(unicode_path, ["Lozz"])
    d = request_pwl_dict(str(unicode_path))
    assert d.check("Lozz")
    assert d
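# The PWL tests in this collection rely on setPWLContents/getPWLContents
# helpers that are not shown in the snippets. A minimal sketch of what they
# plausibly look like, assuming a PWL is just a text file with one word per
# line:
def setPWLContents(path, contents):
    # Overwrite the PWL file with the given words, one per line.
    with open(str(path), "w") as f:
        for word in contents:
            f.write(word + "\n")

def getPWLContents(path):
    # Read the PWL file back as a list of words.
    with open(str(path)) as f:
        return [line.strip() for line in f]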
def __init__(self, lang, base_keymap, full_dictionary_file=None):
    self.__lang = lang
    self.__base_keymap = base_keymap
    self.__keymap = self.__generate_full_keymap(base_keymap)
    self.__full_dictionary_file = full_dictionary_file
    if full_dictionary_file:
        self.__dict = enchant.request_pwl_dict(full_dictionary_file)
def test_checkline(self):
    checker = SpellChecker("en_GB", filters=[URLFilter, EmailFilter])
    pwl = enchant.request_pwl_dict('dict.txt')
    with open('test.txt', 'w+') as tfile:
        self.assertEqual(
            checkline('Lots of words that are spelt orrectly!',
                      'filename.txt', False, checker, pwl, tfile, 0),
            1)
def test_check(pwl_path):
    """Test that basic checking works for PWLs."""
    setPWLContents(pwl_path, ["Sazz", "Lozz"])
    d = request_pwl_dict(str(pwl_path))
    assert d.check("Sazz")
    assert d.check("Lozz")
    assert not d.check("hello")
def generate_personal_word_list(self, minimal_appear_count=100):
    """
    Generate an enchant personal word list (PWL) from all comments: a word
    is added if it is not English but appears more than minimal_appear_count
    times across all comments.
    """
    self.dictionary = defaultdict(int)
    d = enchant.Dict('en_US')
    for doc in self.corpus:
        for token in doc:
            self.dictionary[token] += 1
    word_list = [item for item in self.dictionary.items() if not d.check(item[0])]
    word_list = [r for r in sorted(word_list, key=lambda item: item[1], reverse=True)
                 if r[1] > minimal_appear_count]
    dict_pwl = enchant.request_pwl_dict(self.pwl)
    added = 0
    for word in word_list:
        w = str(word[0])
        if not dict_pwl.check(w):
            dict_pwl.add(w)
            added += 1
            print(w)
    log.info("{} words are not English but appear in comments more than {} times.".format(
        len(word_list), minimal_appear_count))
    log.info("Of those {} words, {} already existed in the PWL; the remaining {} were added.".format(
        len(word_list), len(word_list) - added, added))
def artistDict(aVocab):
    ## Creates an enchant dictionary for a list of words.
    ## enchant.request_pwl_dict() technically tries to open a file,
    ## so we need a way to create the dict alone, without a real file.
    dArtist = enchant.request_pwl_dict("_")
    for sWord in aVocab:
        dArtist.add(sWord)
    return dArtist
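# artistDict() above works around request_pwl_dict() wanting a file path by
# passing "_". A hedged alternative sketch, using the same throwaway-temp-file
# trick as the Pology _create_checker snippet further down: request the PWL on
# a temp file, delete the file, and keep the words in the session only.
import os
import tempfile
import enchant

def session_only_dict(words):
    fd, path = tempfile.mkstemp()
    os.close(fd)
    d = enchant.request_pwl_dict(path)
    os.unlink(path)
    for word in words:
        d.add_to_session(word)  # session-scoped; nothing is written to disk
    return d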
def setUp(self):
    """Create a shared markspell instance to use for testing"""
    logging.basicConfig(level=logging.CRITICAL,
                        format='%(levelname)6s: %(message)s')
    self.logger = logging.getLogger('markdown-spellchecker')
    pwl = enchant.request_pwl_dict(
        join(dirname(realpath(__file__)), 'dict.txt'))
    self.markspell = MarkSpelling(pwl)
def spell_check(string):
    """
    Returns a list of spell-checked words matching the given string.
    The first element in the list is the best match.
    """
    relative_path = os.path.realpath(os.path.dirname(__file__))
    districts_path = os.path.join(relative_path, "districts_services.txt")
    dictionary = enchant.request_pwl_dict(districts_path)
    matches = dictionary.suggest(str(string).lower())
    return matches
def test_add(pwl_path):
    """Test that adding words to a PWL works correctly."""
    d = request_pwl_dict(str(pwl_path))
    assert not d.check("Flagen")
    d.add("Esquilax")
    d.add("Esquilam")
    assert d.check("Esquilax")
    assert "Esquilax" in getPWLContents(pwl_path)
    assert d.is_added("Esquilax")
def correctword(words):
    # Use the built-in dictionary together with a self-added word list.
    pwl = enchant.request_pwl_dict(
        "/Users/lxy/PycharmProjects/data mining/enwiktionary.txt")
    d_gb = enchant.Dict("en_GB")
    d_g = enchant.DictWithPWL(
        "grc_GR", "/Users/lxy/PycharmProjects/data mining/enwiktionary.txt")
    return [word for word in words if d_gb.check(word) or d_g.check(word)]
def test_suggestions(pwl_path):
    """Test getting suggestions from a PWL."""
    setPWLContents(pwl_path, ["Sazz", "Lozz"])
    d = request_pwl_dict(str(pwl_path))
    assert "Sazz" in d.suggest("Saz")
    assert "Lozz" in d.suggest("laz")
    assert "Sazz" in d.suggest("laz")
    d.add("Flagen")
    assert "Flagen" in d.suggest("Flags")
    assert "sazz" not in d.suggest("Flags")
def loadpwl(filename):
    logger = logging.getLogger('markdown-spellchecker')
    if os.path.exists(filename):
        logger.debug('PWL file found')
        pwl = enchant.request_pwl_dict(filename)
        logger.debug('PWL file loaded')
        return pwl
    logger.error('PWL file "%s" does not exist', filename)
    sys.exit(1)
def handle_command(command, channel):
    """Executes bot command if the command is known"""
    # Default response is help text for the user
    default_response = "Not sure what you mean. Try *{}* or *{} (city)*.".format(
        ECHO_COMMAND, WEATHER_COMMAND)
    # Finds and executes the given command, filling in response
    response = None
    if command.startswith(ECHO_COMMAND):
        if len(ECHO_COMMAND) == len(command):
            response = "Sure...I need some text to do that!"
        else:
            response = command[command.index(ECHO_COMMAND) + len(ECHO_COMMAND) + 1:]
    elif command.startswith(WEATHER_COMMAND):
        if len(WEATHER_COMMAND) == len(command):
            response = "Sure...I need a city to do that!"
        else:
            # Use the city list as a PWL so misspelt city names get corrected.
            dictionary = enchant.request_pwl_dict("cities.txt")
            suggestion = dictionary.suggest(
                command[command.index(WEATHER_COMMAND) + len(WEATHER_COMMAND) + 1:])
            requestget = requests.get(
                'http://api.openweathermap.org/data/2.5/weather?q='
                + suggestion[0].replace(" ", "%20")
                + '&units=metric&appid=bbb393e2a17ca6ff2a90939e14b836e2')
            if requestget.status_code == 200:
                responsedata = requestget.json()
                response = (
                    'Today\'s weather for: ' + responsedata['name'] + ', '
                    + responsedata['sys']['country']
                    + '\nDescription: ' + responsedata['weather'][0]['main']
                    + ', ' + responsedata['weather'][0]['description']
                    + '\nTemperature in Celsius: '
                    + "{0:.2f}".format(responsedata['main']['temp'])
                    + '\nMinimum Temperature in Celsius: '
                    + "{0:.2f}".format(responsedata['main']['temp_min'])
                    + '\nMaximum Temperature in Celsius: '
                    + "{0:.2f}".format(responsedata['main']['temp_max'])
                    + '\nHumidity: ' + str(responsedata['main']['humidity'])
                    + '%\nWind: '
                    + "{0:.2f}".format(responsedata['wind']['speed'])
                    + ' meters/sec')
            else:
                response = "Unfortunately...I do not recognize the city"
    # Sends the response back to the channel
    slack_client.api_call("chat.postMessage", channel=channel,
                          text=response or default_response)
def f2s(line):
    # Map first-person phrasing to second-person and drop conjunctions.
    rep = {"'": "", "I ": "you ", "am ": "are ", "my": "your", "we ": "they",
           " us": "you ", "Because": "", "because": "", " me ": " you ",
           "im": "you're", "I'm": "you're", "and": "", "but": "", "for": "",
           "if": "", "or": "", "when": "", "My": "your"}
    # Replace substrings in line with those in rep
    rep = dict((re.escape(k), v) for k, v in rep.items())
    pattern = re.compile("|".join(rep.keys()))
    b = pattern.sub(lambda m: rep[re.escape(m.group(0))], line)
    b = random.choice(tokenize.sent_tokenize(b))
    # Split the sentence if it has a comma or "because"
    if "," in b:
        split = b.split("because")
        split = b.split(",")
        # Spell check
        pwl = enchant.request_pwl_dict("mywords.txt")
        d2 = enchant.DictWithPWL("en_US", "mywords.txt")
        # If there is a comma, choose the side with the most words to pass
        # to the chat function.
        len_choice = max(split, key=len)
        # Strip the leading space left over from the comma split.
        # (The original tested len_choice[:0] == "", which is always true.)
        if len_choice[:1] == " ":
            len_choice = len_choice.replace(" ", "", 1)
        print(d2.check(b))
        print(len_choice)
        return len_choice.lower()
    else:
        # Spell check
        pwl = enchant.request_pwl_dict("mywords.txt")
        d2 = enchant.DictWithPWL("en_US", "mywords.txt")
        return b.lower()
def spellchecker(is_interactive_deploy=True):
    """Spellcheck the Markdown and ReST files on the site"""
    spelling_errors_found = False
    # aspell is available on mac by default, and I don't want to manage custom
    # word lists for both aspell and myspell so we'll just use aspell
    enchant._broker.set_ordering("en_GB", "aspell")
    pwl_dictionary = enchant.request_pwl_dict(SPELLCHECK_EXCEPTIONS)
    en_spellchecker = enchant.checker.SpellChecker(
        "en_GB",
        filters=[enchant.tokenize.EmailFilter, enchant.tokenize.URLFilter])
    md_posts = glob.glob(os.path.join(SITE_BASE, "posts", "*.md"))
    md_pages = glob.glob(os.path.join(SITE_BASE, "stories", "*.md"))
    for file_to_check in md_pages + md_posts:
        with open(file_to_check, "r", encoding="utf-8") as f:
            lines = f.readlines()
        e = _get_spellcheck_exceptions(lines)
        list(map(pwl_dictionary.add_to_session, e))
        for line in _non_directive_lines(lines):
            en_spellchecker.set_text(strip_markdown_directives(line))
            for err in en_spellchecker:
                if not pwl_dictionary.check(err.word):
                    spelling_errors_found = True
                    spelling_error = (
                        "Not in dictionary: %s (file: %s line: %s). "
                        "Suggestions: %s" % (
                            err.word,
                            os.path.basename(file_to_check),
                            lines.index(line) + 1,
                            ", ".join(en_spellchecker.suggest(err.word))))
                    print(spelling_error)
                    if is_interactive_deploy:
                        action = prompt(
                            "Add '%s' to dictionary [add] or "
                            "replace [type replacement]?" % (err.word,),
                            default="add").strip()
                        if action == "add":
                            _add_to_spellcheck_exceptions(file_to_check, err.word)
                            pwl_dictionary.add(err.word)
                        else:
                            _replace_in_file(file_to_check, err.word, action)
                    else:
                        _send_pushover_summary(spelling_error,
                                               "Spelling error: %s" % (err.word,))
    return spelling_errors_found
def spellCorrect(str_input):
    str_input = removeRedundantWhiteSpaces(str_input)
    str_input = toLowerCase(str_input)
    str_input = removePunctuations(str_input).split(" ")
    # Load the PWL once, rather than once per word as in the original.
    d = enchant.request_pwl_dict("words.txt")
    out_array = []
    for word in str_input:
        if d.check(word):
            out_array.append(word)
        else:
            word_list = d.suggest(word)
            out = suggestions(word_list, word)
            out_array.append(out)
    output = ' '.join(out_array)
    return output
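# spellCorrect() above calls a suggestions() helper that is not shown in the
# snippet. A plausible sketch, assuming it simply picks the suggestion closest
# to the misspelt word (difflib is used for the same job elsewhere in these
# examples):
import difflib

def suggestions(word_list, word):
    if not word_list:
        return word  # no suggestions; keep the original word
    matches = difflib.get_close_matches(word, word_list, n=1, cutoff=0.0)
    return matches[0] if matches else word_list[0]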
def detectErrors(self, string):
    '''
    @brief get all errors given a string
    @return False if the string has no errors, otherwise a dict with the errors
    '''
    spellChecker = SpellChecker('es', filters=[
        EmailFilter, URLFilter, HtmlEntitiesFilter, sprintfParametersFilter
    ])
    spellChecker.set_text(string)
    PWL_es = enchant.request_pwl_dict(self.es_PWL_path)
    errors = False
    for err in spellChecker:
        # Skip words that are valid English; several words are English and
        # would otherwise be marked as errors.
        if self.dict_en_WP_PWL.check(err.word) is True:
            continue
        # Skip words that are in the Spanish PWL.
        if PWL_es.check(err.word) is True:
            continue
        # Convert errors to a dictionary only once. (The original compared
        # type(errors) against dict(), which is never equal, so the dict was
        # reset on every iteration.)
        if not isinstance(errors, dict):
            errors = {}
            errors['errorWord'] = list()
        errors['errorWord'].append(err.word)
        # Add the bad word to the list
        self.addNewBadWord(err.word)
    return errors
def ccchecker(dictionary):
    # The goal is to make sure all the course codes are correct.
    try:
        pattern = re.compile(r'\d\w+')  # pattern to match the course codes
        # course-code spellcheck word list
        pwl = enchant.request_pwl_dict("/Users/damola/Desktop/ ")
        for i in dictionary:  # iterate over the dictionary
            count = 0
            for j in dictionary[i]:  # iterate over the values for key i
                if re.search(pattern, j):  # only run when a code is present
                    pin = re.search(pattern, j)  # the matched course code
                    if len(pin.group()) >= 4:  # has enough characters
                        # replace the course code with a corrected version
                        dictionary[i][count] = j.replace(
                            j[pin.span()[0]:pin.span()[1]],
                            pwl.suggest(pin.group())[0])
                count = count + 1
        return dictionary
    except Exception:
        print('CCCHECKER did not work')
        return dictionary
def timecheck(dictionary):
    # Checks the times in each textblock.
    try:
        # Times arrive from the user in the form ('11:30', 'AM').
        pattern = re.compile(r'(\S+)([AP]M)')   # first time in the textblock
        pattern1 = re.compile(r'(\w+):(\w+)')   # groups a time code, e.g. (11, 30)
        # load the timecode spellcheck word list
        pwl = enchant.request_pwl_dict(os.path.expanduser("~/Desktop/timecodes.txt"))
        for i in dictionary:  # iterate over the dictionary
            count = 0
            for j in dictionary[i]:  # iterate over the values for key i
                if re.search(pattern, j):  # only run when a time is present
                    length = re.findall(pattern, j)  # all times in the textblock
                    if len(length) == 1:
                        # If the AM or PM is not spelt correctly this corrects it.
                        AMPM = length[0][1]  # AM or PM
                        listoftimes = re.findall(pattern1, j)
                        listoftimes1 = listoftimes[0][1]
                        print(listoftimes1)
                        listoftimes2 = listoftimes[1][1]
                        time1 = pwl.suggest(listoftimes1)
                        print(time1)
                        time2 = pwl.suggest(listoftimes2)
                        if len(time1) == 2:
                            for k in time1:
                                if AMPM in k:
                                    index = time1.index(k)
                                    pin = re.search(pattern1, j)
                                    dictionary[i][count] = j.replace(
                                        j[pin.span()[0]:pin.span()[1]],
                                        '%s:%s' % (
                                            re.search(pattern1, j).groups()[0],
                                            pwl.suggest(re.search(pattern1, j).groups()[1])[index]))
                        elif len(time2) == 2:
                            for k in time1:
                                if AMPM in k:
                                    index = time1.index(k)
                                    pin = re.search(pattern1, j)
                                    dictionary[i][count] = j.replace(
                                        j[pin.span()[0]:pin.span()[1]],
                                        '%s:%s' % (
                                            re.findall(pattern, j)[1][0],
                                            pwl.suggest(re.findall(pattern, j)[1][1])[index]))
                count = count + 1
        return dictionary
    except Exception:
        print('timecheck did not work')
        return dictionary
def check_spelling(meme):
    """
    Spell-check the meme named in the message against the available memes in
    our meme list, and return the suggestion most similar to the input.
    """
    word_list = enchant.request_pwl_dict("meme_list")
    meme_dict = enchant.DictWithPWL("en_US", "meme_list")
    suggestions = meme_dict.suggest(meme)
    max_ratio = 0.0
    highest_word = ""
    for suggestion in suggestions:
        temp_ratio = similarity(meme, suggestion)
        if temp_ratio > max_ratio:
            # Remember both the best ratio and the word; the original only
            # stored the word, so every later suggestion overwrote it.
            max_ratio = temp_ratio
            highest_word = suggestion
    return highest_word
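# check_spelling() above depends on a similarity() helper that is not shown.
# A minimal sketch, assuming a plain difflib ratio:
import difflib

def similarity(a, b):
    # Ratio in [0, 1]; higher means the strings are more alike.
    return difflib.SequenceMatcher(None, a.lower(), b.lower()).ratio()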
def _create_checker(providers, langtag, words):
    try:
        import enchant
    except ImportError:
        pkgs = ["python-enchant"]
        raise PologyError(_(
            "@info",
            "Python wrapper for Enchant not found, "
            "please install it (possible package names: %(pkglist)s).",
            pkglist=format_item_list(pkgs)))
    if langtag is not None:
        try:
            broker = enchant.Broker()
            if providers is not None:
                broker.set_ordering(langtag, providers)
            checker = broker.request_dict(langtag)
            checker.check(".")
        except:
            checker = None
    else:
        # Request a PWL dict on a throwaway temp file; the words are then
        # held in the session only.
        tmpf = tempfile.NamedTemporaryFile()
        tmpf.close()
        checker = enchant.request_pwl_dict(tmpf.name)
        os.unlink(tmpf.name)
    if checker:
        pname = checker.provider.name.split()[0].lower()
        need_upcasing = (pname in ("personal", "myspell"))
        for word in words or []:
            checker.add_to_session(word)
            if need_upcasing:
                checker.add_to_session(word[0].upper() + word[1:])
                checker.add_to_session(word.upper())
    return checker
def suggester(word, dct):
    dictio = enchant.request_pwl_dict(f"/app/bot/dictionaries/{dct}.txt")
    if dictio.check(word):
        return True
    return dictio.suggest(word)
import lzma
import pickle
import curses
import random
import enchant
from index import index
from doubleentries import doubleentries

with lzma.open('webster.txt.xz', 'rt') as infile:
    webster = infile.read()
wordlist = enchant.request_pwl_dict('wordlist.txt')

# initialize terminal screen
screen = curses.initscr()
curses.noecho()
curses.cbreak()
screen.keypad(True)
screen.refresh()

def display(textchunk):
    screen.clear()
    height = screen.getmaxyx()[0]
    command = 0
    offset = 0
    screen.scrollok(True)
    while chr(command) not in 'Qq':
        screen.clear()
        dispStr = ""
        lineCounter = 0
        for char in textchunk:
            if char == "\n":
                lineCounter += 1
def __init__(self, dict_name='en_US', max_dist=2):
    if dict_name == 'en_US':
        self.spell_dict = enchant.Dict(dict_name)
    else:
        # Anything other than en_US is treated as a personal word list file.
        self.spell_dict = enchant.request_pwl_dict(dict_name)
    # The original hard-coded 2 here, silently ignoring the max_dist argument.
    self.max_dist = max_dist
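# The snippet above stores max_dist but never shows it being used. A hedged
# sketch of a companion replace() helper, assuming max_dist is meant to bound
# the edit distance of accepted suggestions and assuming pyenchant's
# enchant.utils.levenshtein helper:
from enchant.utils import levenshtein

def replace(corrector, word):
    # Return the closest suggestion within max_dist edits, else the word itself.
    suggested = corrector.spell_dict.suggest(word)
    if suggested and levenshtein(word, suggested[0]) <= corrector.max_dist:
        return suggested[0]
    return word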
from gluon.html import *
import re, enchant

DIGITS = re.compile(r'\d')
INFRANK = re.compile(r'(\s+(ssp|subsp|var|forma|f)([.]?)(?=\s+))', re.I)
NAMES = 'applications/phylografter/static/names.txt'
d = enchant.request_pwl_dict(NAMES)

def check(s):
    try:
        return d.check(s)
    except:
        print('spellcheck.check error', s)
        return None

def suggest(s):
    return d.suggest(s)

def process_label(db, otu):
    options = []
    s = otu.label.replace('_', ' ')
    if check(s):
        options = list(db(db.ott_name.name == s).select())
        return (True, options)
    v = suggest(s)
    if not v:
        words = s.replace('.', ' ').split()
        if words[-1].lower() == 'sp':
            words = words[:-1]
        s = DIGITS.sub('', ' '.join(words))
        if not s:
            return (False, options)
        if check(s):
            options = list(db(db.ott_name.name == s).select())
def load_custom_wordlist(self, wordlist_file):
    self.enchant = enchant.request_pwl_dict(wordlist_file)
# Control TOR and establish a new identity, assigning the user a new IP.
def newIdentity():
    socks.setdefaultproxy()
    s = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    s.connect(("127.0.0.1", 9051))
    s.send(b"AUTHENTICATE\r\n")
    response = s.recv(128)
    if response.startswith(b"250"):
        s.send(b"SIGNAL NEWNYM\r\n")
    s.close()
    connectTor()

flagged_queries = set()
flag_wan_dom = set()
pwl = enchant.request_pwl_dict("enchantAddList.txt")

from bulkwhois.shadowserver import BulkWhoisShadowserver

# ASN rankings by number of ASes in system, retrieved from
# http://as-rank.caida.org/
def getASNRankings():
    totalNumASes = 44086
    asn_rank_dic = {}
    for line in open("ASNRankingsByAS.txt", "r"):
        line = line.strip()
        line = line.split(",")
        rank = line[0]
        rank = int(rank)
        asn = line[1]
        num_ASes = line[2]
def __init__(self, request, pwldir=None):
    SpellChecker.__init__(self, request, pwldir)
    if self._haspwl():
        self.__pwldic = enchant.request_pwl_dict(self._pwlfilename())
import math

import enchant
import pandas
from nltk.util import everygrams

def ffeatures(domain):
    domain = cut_extend(domain)
    ts_bigrams = pandas.read_csv("all_bigrams.csv",
                                 converters={i: str for i in range(54872)})
    ts_trigrams = pandas.read_csv("all_trigrams.csv",
                                  converters={i: str for i in range(54872)})
    ds_bigrams = pandas.read_csv("bigrams_sorted.csv",
                                 converters={i: str for i in range(54872)})
    ds_trigrams = pandas.read_csv("trigrams_sorted.csv",
                                  converters={i: str for i in range(54872)})
    N = 1444
    M = 54872
    hexchars = "0123456789abcdefABCDEF"
    frequent = {
        "a": 9.35, "b": 2.27, "c": 3.87, "d": 3.26, "e": 9.69, "f": 1.67,
        "g": 2.4, "h": 2.56, "i": 7.4, "j": 0.55, "k": 1.9, "l": 4.65,
        "m": 3.37, "n": 6.12, "o": 7.28, "p": 2.91, "q": 0.21, "r": 6.44,
        "s": 6.48, "t": 6.13, "u": 3.23, "v": 1.37, "w": 1.2, "x": 0.67,
        "y": 1.67, "z": 0.68, "0": 0.18, "1": 0.24, "2": 0.23, "3": 0.15,
        "4": 0.16, "5": 0.1, "6": 0.09, "7": 0.09, "8": 0.1, "9": 0.08,
        ".": 0, "-": 1.26
    }
    domain = str(domain).lower()
    found_bi = []
    num_found_bi = []
    found_tri = []
    num_found_tri = []
    # f1: number of distinct known bigrams in the domain
    for j in range(len(domain) - 1):
        bi = domain[j:j + 2]
        if bi not in found_bi:
            res = ds_bigrams[ds_bigrams["Bigrams"] == bi]
            if not res.empty:
                found_bi.append(bi)
                num_found_bi.append(1)
        else:
            pos = found_bi.index(bi)
            num_found_bi[pos] = num_found_bi[pos] + 1
    f1 = len(found_bi)
    # f9: number of distinct known trigrams in the domain
    for j in range(len(domain) - 2):
        tri = domain[j:j + 3]
        if tri not in found_tri:
            res = ds_trigrams[ds_trigrams["Trigrams"] == tri]
            if not res.empty:
                found_tri.append(tri)
                num_found_tri.append(1)
        else:
            pos = found_tri.index(tri)
            num_found_tri[pos] = num_found_tri[pos] + 1
    f9 = len(found_tri)
    # f2: count-weighted bigram rank sum
    f2 = 0
    for i in range(f1):
        index = ts_bigrams.index[ts_bigrams["Bigrams"] == found_bi[i]][0] + 1
        f2 = f2 + (num_found_bi[i] * index)
    # f10: count-weighted trigram rank sum
    f10 = 0
    for i in range(f9):
        index = ts_trigrams.index[ts_trigrams["Trigrams"] == found_tri[i]][0] + 1
        f10 = f10 + (num_found_tri[i] * index)
    # f3: mean count-weighted bigram position in the sorted list
    f3 = 0
    for i in range(f1):
        vt = ds_bigrams.index[ds_bigrams["Bigrams"] == found_bi[i]][0] + 1
        f3 = f3 + (num_found_bi[i] * vt)
    f3 = f3 / f1
    # f11: mean count-weighted trigram position in the sorted list
    f11 = 0
    for i in range(f9):
        vt = ds_trigrams.index[ds_trigrams["Trigrams"] == found_tri[i]][0] + 1
        f11 = f11 + (num_found_tri[i] * vt)
    f11 = f11 / f9
    # f4-f6, f12-f14: length-normalised variants of the above
    f4 = f2 / len(domain)
    f12 = f10 / len(domain)
    f5 = f3 / len(domain)
    f13 = f11 / len(domain)
    f6 = f1 / len(domain)
    f14 = f9 / len(domain)
    # f7: known-bigram occurrences per character
    f7 = 0
    for i in range(f1):
        f7 = f7 + num_found_bi[i]
    f7 = f7 / len(domain)
    # f15: known-trigram occurrences per character
    f15 = 0
    for i in range(f9):
        f15 = f15 + num_found_tri[i]
    f15 = f15 / len(domain)
    # f8: entropy-style score over bigram positions
    f8 = 0
    for i in range(f1):
        vt = ds_bigrams.index[ds_bigrams["Bigrams"] == found_bi[i]][0] + 1
        f8 = f8 + ((vt / N) * math.log10(vt / N))
    f8 = -f8
    # f16: entropy-style score over trigram positions
    f16 = 0
    for i in range(f9):
        vt = ds_trigrams.index[ds_trigrams["Trigrams"] == found_tri[i]][0] + 1
        f16 = f16 + ((vt / M) * math.log10(vt / M))
    f16 = -f16
    # f17, f18: vowel count and vowel ratio
    f17 = 0
    vowels = ['a', 'e', 'i', 'o', 'u']
    for i in range(len(domain)):
        if domain[i] in vowels:
            f17 = f17 + 1
    f18 = f17 / len(domain)
    # f19: contains a non-hex character beyond the first position
    f19 = 0
    if len(domain) == 1:
        f19 = 1
    else:
        for i in range(len(domain)):
            if domain[i] not in hexchars:
                if i != 0:
                    f19 = 1
                    break
    # f20: mean English letter-frequency score
    unq_chars = ''.join(set(domain))
    f20 = 0
    for char in unq_chars:
        f20 = f20 + (domain.count(char) * frequent[char])
    f20 = f20 / len(domain)
    # f21: number of dictionary words embedded in the domain, checked
    # against a PWL rather than the standard dictionary
    # d = enchant.Dict("en_US")
    d = enchant.request_pwl_dict("wordlist.txt")
    kq = [''.join(_ngram) for _ngram in everygrams(domain)
          if d.check(''.join(_ngram)) and len(_ngram) > 1]
    f21 = len(kq)
    # f22: very short domain flag
    if len(domain) < 5:
        f22 = 1
    else:
        f22 = 0
    dfObj = pandas.DataFrame(columns=[
        'f1', 'f2', 'f3', 'f4', 'f5', 'f6', 'f7', 'f8', 'f9', 'f10', 'f11',
        'f12', 'f13', 'f14', 'f15', 'f16', 'f17', 'f18', 'f19', 'f20', 'f21',
        'f22'
    ])
    dfObj.loc[0] = [
        f1, f2, f3, f4, f5, f6, f7, f8, f9, f10, f11, f12, f13, f14, f15,
        f16, f17, f18, f19, f20, f21, f22
    ]
    return dfObj
n = len(words)
for word in words:
    if word:
        if word in all_words_list:
            all_words_list[word] = all_words_list[word] + 1
        else:
            all_words_list[word] = 1

for word in all_words_list:
    idf_list[word] = idf(word, corpus)

m = len(all_words_list)
all_words_list = sortDict(all_words_list)

# Use the corpus itself as a PWL so the query corrector only suggests
# words that actually occur in the documents.
pwl = enchant.request_pwl_dict("big.txt")
print('enter query')
a = input()
a = queryCorrector(a, pwl)
print(type(a))
print(a)
x = getRelativeDocs(a, all_words_list, idf_list)
# GLOBAL VARIABLES
tm = time.time()

# Global list of tweets: every element is a dictionary
archive_list = []

# Lists of conversations and users, split into flames and non-flames
flames = []
noflames = []
flamesU = []
noflamesU = []

# Add as dictionary the English language plus the bad words
pwl = enchant.request_pwl_dict('dicts/bad-words.txt')
Dict = enchant.DictWithPWL('en_US', 'dicts/bad-words.txt')
slg = mount_slang_dict()

# Measurements used in feature evaluation
fla_upp = 0.0
nfla_upp = 0.0
tmp_upp = 0.0
fla_mrk = 0.0
nfla_mrk = 0.0
tmp_mrk = 0.0
fla_gsm = 0.0
nfla_gsm = 0.0
tmp_gsm = 0.0
fla_bsm = 0.0
nfla_bsm = 0.0
unicode_chars = (chr(i) for i in range(sys.maxunicode))
control_unicode_chars = ''.join(c for c in unicode_chars
                                if unicodedata.category(c) == 'Cc')
control_char_re = re.compile('[%s]' % re.escape(control_unicode_chars))

def remove_control_chars(s):
    return control_char_re.sub('', s)

# ================================================================== #
# Delongation
us_dict = enchant.Dict("en_US")
uk_dict = enchant.Dict("en_GB")
au_dict = enchant.Dict("en_AU")
twitter_dict = enchant.request_pwl_dict(
    os.path.join(os.path.dirname(__file__), 'assets', 'twitter_jargon.txt'))

def is_known_word(word):
    return us_dict.check(word) \
        or uk_dict.check(word) \
        or au_dict.check(word) \
        or twitter_dict.check(word)

delongate_pattern = re.compile(r"(.)\1{2,}")

def delongate(text):
    return delongate_pattern.sub(r"\1\1", text)
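# Quick usage sketch for the delongation helpers above: delongate() squeezes
# any run of three or more repeated characters down to two, after which
# is_known_word() can vet the result against the en_US/en_GB/en_AU
# dictionaries plus the Twitter-jargon PWL.
print(delongate("heyyyyy"))      # -> "heyy"
print(delongate("soooo coool"))  # -> "soo cool"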
("israel", "Israel"), ) # List of dictionary objects to test dicts = [] # Number of correct words missed by each dictionary missed = [] # Number of corrections not suggested by each dictionary incorrect = [] # Number of places to find correct suggestion, or -1 if not found dists = [] # Create each dictionary object for prov in providers: if prov == "pypwl": d = enchant.request_pwl_dict(wordsfile) else: b = enchant.Broker() b.set_ordering(lang, prov) d = b.request_dict(lang) if not d.provider.name == prov: raise RuntimeError("Provider '%s' has no dictionary for '%s'" % (prov, lang)) del b dicts.append(d) missed.append([]) incorrect.append([]) dists.append([]) # Actually run the tests testcases = open(datafile, "r")
CONFIGFILE.read(CONFIGFILECOMPLETEPATH)
CONFIGFILE.read(DIRECTORY_TESTS, 'config.ini')
DEFAULTCONFIGFILE = CONFIGFILE['DEFAULT']
DIRECTORY_ROOT = os.path.dirname(DIRECTORY_TESTS)
FILENAME_JSONSCORE = DEFAULTCONFIGFILE['Prevscore']
FILENAME_PWL = DEFAULTCONFIGFILE['PWL']
if not os.path.isabs(FILENAME_JSONSCORE):
    FILENAME_JSONSCORE = os.path.join(DIRECTORY_TESTS,
                                      DEFAULTCONFIGFILE['Prevscore'])
if not os.path.isabs(FILENAME_PWL):
    FILENAME_PWL = os.path.join(DIRECTORY_TESTS, DEFAULTCONFIGFILE['PWL'])

if os.path.exists(FILENAME_PWL):
    print("\033[1;36mPWL file exists\033[0m")
    pwl = enchant.request_pwl_dict(FILENAME_PWL)
else:
    print("\033[1;36mPWL file does not exist\033[0m")
    sys.exit(2)

# add words to the dictionary used to test for spelling errors
spellcheck = SpellChecker("en_GB", filters=[URLFilter, EmailFilter])
# creates/opens a file to save the words that were spelt wrong
wordswrong = open(CONFIGFILE['DEFAULT']['Wordswrongfile'], "w+")
# creates/opens a file to save the files that were checked
filecheck = open(CONFIGFILE['DEFAULT']['Filecheck'], "w+")

def main():
    parser = argparse.ArgumentParser(
import enchant

# Ensure the custom words file exists (truncating any previous contents).
f = open("customWords.txt", 'w')
f.close()
pwl = enchant.request_pwl_dict("customWords.txt")
d = enchant.Dict("en_US")
singleLetters = ['a', 'i']

def validateLeft(variable):
    return d.check(variable) or pwl.check(variable)

def compute(variable, parent):
    if variable == '':
        return True
    if len(variable) == 1:
        return []
    tempRes = []
    for i in reversed(range(1, len(variable) + 1)):
        resLeft = validateLeft(variable[:i])
        if resLeft:
            resRight = compute(variable[i:], variable[:i])
            # resRight may be a list, so the explicit True check matters.
            if resRight == True and i != 1:
                tempRes.append([[variable[:i].lower()]])
            # Don't add single letter entries
            elif len(variable[:i]) == 1 and variable[:i].lower() not in singleLetters:
                pass  # single letters other than 'i' or 'a': move on
            else:
                # Recursively check for all words in the right part of entries
                for y in resRight:
                    innerRes = [variable[:i]]
                    for entry in y:
                        innerRes += entry
                    tempRes.append([innerRes])
    return tempRes
import os
import enchant
from enchant.checker import SpellChecker

# start using: python -i enchant_console.py

# combine with the US dictionary
# pdict = enchant.DictWithPWL("en_US", "wordlist.txt")
pdict = enchant.request_pwl_dict("wordlist.txt")
chkr = SpellChecker(pdict)

def SuggestSpelling(theword):
    return chkr.suggest(theword)

def CheckText(thetext):
    chkr.set_text(thetext)
    rsp = []
    for err in chkr:
        rsp.append(err.word)
    return rsp

'''
chkr.set_text("pelvi frecture")
for err in chkr:
    print("ERROR:", err.word)
'''
# count word frequency
freqmap = {}
for w in arr_word:
    if w not in freqmap:
        freqmap[w] = 0
    freqmap[w] += 1

# build dict file from words that clear the frequency threshold
dict_words = [w for w in freqmap if freqmap[w] >= DICT_THRESHOLD]
# corpus_dict = enchant.Dict()
corpus_dict = enchant.request_pwl_dict(TMP_PREFIX + docid)
for w in dict_words:
    corpus_dict.add_to_pwl(w)

# Data munging
last_varid = arr_varid[0]
last_candid = arr_candid[0]
last_source = arr_source[0]
data = []
thisvar = []
thiscand = []

# GENERATE DATA with one pass
        return_list.append((classes[r[0]], r[1]))
    # return tuple of intent and probability
    return return_list

def output_message(intents, sentence, model, userID='123', show_details=False):
    results = classify(intents, sentence, model)
    # if we have a classification then find the matching intent tag
    if results:
        # loop as long as there are matches to process
        while results:
            for i in intents['intents']:
                # find a tag matching the first result
                if i['tag'] == results[0][0]:
                    # print a random response from the intent and stop
                    print("Dr. RK -> ", random.choice(i['responses']))
                    return
            results.pop(0)

# d = enchant.Dict("en_GB")
d = enchant.request_pwl_dict("english_dict.txt")
model = trained_model(intents)
while True:
    sentence = input("")
    if sentence.lower().strip() == "bye":
        break
    output = output_message(intents, sentence, model)
"ms": ["ms_MY"], "sge": [], "zh": [] # "sge" and "zh" handled with personal word lists below } # --- Corresponding dictionaries --- spelling_dictionaries = {} for language in spelling_languages.keys(): spelling_dictionaries[language] = {} for variant in spelling_languages[language]: spelling_dictionaries[language][variant] = enchant.Dict(variant) # --- SgE word lists --- spelling_dictionaries["sge"] = {} sge_lists = sge_words + sge_chinese_derived_words + sge_malay_derived_words for wordlist in sge_lists: spelling_dictionaries["sge"][wordlist] = enchant.request_pwl_dict(wordlist) # --- Additional word list handling --- # Count Chinese-derived words in SgE as Chinese for wordlist in sge_chinese_derived_words: spelling_dictionaries["zh"][wordlist] = enchant.request_pwl_dict(wordlist) for wordlist in sge_malay_derived_words: spelling_dictionaries["ms"][wordlist] = enchant.request_pwl_dict(wordlist) def extract_features(sentence): tokenised = tokenise(sentence) tokenised_spellcheck = prep_tokens_for_spellcheck(tokenised) features = {} ## Primary features # Chinese features["has_zh_chars"] = has_zh_chars(sentence) features["has_pinyin"] = has_pinyin(tokenised_spellcheck)
def parse_options():
    parser = OptionParser()
    usage = 'usage: brxor.py [options] <file>'
    parser = OptionParser(usage=usage)
    parser.add_option('-k', '--key', action='store', dest='key',
                      type='string', help='Static XOR key to use')
    parser.add_option('-f', '--full', action='store_true', dest='full',
                      help='XOR full file')
    parser.add_option('-d', '--dict', action='store', dest='user_dict',
                      help="User supplied dictionary, one word per line "
                           "(Default: 'en_us')")
    parser.add_option('-l', '--length', action='store', dest='length',
                      default=4,
                      help="Minimum word length to use (Default: 4)")
    parser.add_option('-v', '--verbose', action='store_true', dest='verbose',
                      help='Increase verbosity of output.')
    (options, args) = parser.parse_args()
    global word_dict
    # Test for Args
    if len(sys.argv) < 2:
        parser.print_help()
        return
    # Test that the full option contains a XOR Key
    if options.full is not None and options.key is None:
        print('[ERROR] --FULL OPTION MUST INCLUDE XOR KEY')
        return
    # XOR the full file with key (f is the input file handle opened elsewhere)
    if options.full is not None and options.key is not None:
        sys.stdout.write(xor(f.read(), options.key))
        return
    if options.user_dict:
        # check that the supplied dictionary file is readable
        if os.path.isfile(options.user_dict) and os.access(options.user_dict,
                                                           os.R_OK):
            word_dict = enchant.request_pwl_dict(options.user_dict)
        else:
            print('[ERROR] FILE CAN NOT BE OPENED OR READ!\n')
            print(usage)
            sys.exit(1)
    else:
        word_dict = enchant.Dict('en_US')
    # Parse file for regular expressions
    return options
documents = prepare_data()

# Magic happening with the term-document matrix
cv = CountVectorizer(
    lowercase=True,
    binary=True,
    token_pattern=r'(?u)\b\w+\b',
    ngram_range=(1, 3)
)  # 1gram: ngram_range=(1,1), 2gram: ngram_range=(2,2),
   # from 1gram to 3gram: ngram_range=(1, 3) ... etc
sparse_matrix = cv.fit_transform(documents)
dense_matrix = sparse_matrix.todense()
td_matrix = dense_matrix.T
sparse_td_matrix = sparse_matrix.T.tocsr()
t2i = cv.vocabulary_  # dictionary of terms
terms = cv.get_feature_names()

# Make a file of the list of words, one term per line. (This must happen
# before the PWL is requested, or the PWL would see a stale or empty list.)
f = open("data100_wordlist_3gram.txt", "w")
for k, v in t2i.items():
    f.write(k + "\n")
f.close()

# Defining a personal wordlist (= pwl) from that file
terms_textfile = enchant.request_pwl_dict("data100_wordlist_3gram.txt")
unknownword_list = []  # for similar word suggestions

# TF-IDF
tfv5 = TfidfVectorizer(lowercase=True, sublinear_tf=True, use_idf=True,
                       norm="l2", token_pattern=r'(?u)\b\w+\b')
sparse_matrix = tfv5.fit_transform(documents).T.tocsr()
import enchant
import nltk
import numpy
import pickle
import re
import sys
from multiprocessing import Pool
from nltk.corpus import stopwords
from nltk.probability import FreqDist
from nltk.stem.lancaster import LancasterStemmer
from nltk.stem.porter import PorterStemmer
from nltk.stem.wordnet import WordNetLemmatizer

CHAT_WORDS = nltk.corpus.nps_chat.words()
ENGLISH_WORDS = nltk.corpus.words.words()
PWL = enchant.request_pwl_dict('CHAT_WORDS')
DICT = enchant.DictWithPWL("en_US", 'chat_words')

# First removes punctuation and numbers and adds a tag for each word. Then
# removes the list of stop words given by nltk. Then applies the lemmatizer
# followed by the Lancaster stemmer to get the root of each word. The
# lemmatizer checks against the word-list dictionary, so the words it forms
# are actual words, but it doesn't remove every stem. The Lancaster stemmer
# removes more stems, but it doesn't check a dictionary, so many of the
# words it produces are not actual words.
def remove_stems(file):
    new_file = []
    punctuation = re.compile(r'[.,"?!:;]')
    lemmatizer = WordNetLemmatizer()
import enchant, difflib
import pandas as pd
import numpy as np
import re
import pickle
import random
import nltk
from nltk.tokenize import word_tokenize
from nltk.corpus import stopwords
from string import punctuation

stuff_to_be_removed = list(stopwords.words("english")) + list(punctuation)

file_path = "words.txt"
dictionary = enchant.request_pwl_dict(file_path)

data = pd.read_excel("data/sampledata_v2.xlsx")
for i in data.columns:
    data[i] = data[i].str.lower()

class color:
    PURPLE = "\033[95m"
    CYAN = "\033[96m"
    DARKCYAN = "\033[36m"
    BLUE = "\033[94m"
    GREEN = "\033[92m"
    YELLOW = "\033[93m"
    RED = "\033[91m"
    BOLD = "\033[1m"
    UNDERLINE = "\033[4m"
def test_UnicodeCharsInPath(tmp_path):
    """Test that unicode chars in PWL paths are accepted."""
    _fileName = r"test_\xe5\xe4\xf6_ing"
    path = tmp_path / _fileName
    d = request_pwl_dict(str(path))
    assert d
# This is so we can use unmodified tests published by third parties
corrections = (("caesar", "Caesar"), ("confucianism", "Confucianism"),
               ("february", "February"), ("gandhi", "Gandhi"),
               ("muslims", "Muslims"), ("israel", "Israel"))

# List of dictionary objects to test
dicts = []
# Number of correct words missed by each dictionary
missed = []
# Number of corrections not suggested by each dictionary
incorrect = []
# Number of places to find correct suggestion, or -1 if not found
dists = []

# Create each dictionary object
for prov in providers:
    if prov == "pypwl":
        d = enchant.request_pwl_dict(wordsfile)
    else:
        b = enchant.Broker()
        b.set_ordering(lang, prov)
        d = b.request_dict(lang)
        if not d.provider.name == prov:
            raise RuntimeError("Provider '%s' has no dictionary for '%s'"
                               % (prov, lang))
        del b
    dicts.append(d)
    missed.append([])
    incorrect.append([])
    dists.append([])

# Actually run the tests
testcases = open(datafile, "r")
testnum = 0