class LeetSpeak:
    """Translate plain text to leetspeak and leetspeak back to English.

    Encoding (``ConvertToLeet``) substitutes every a-z letter with a randomly
    chosen entry from that letter's substitution table.  Decoding
    (``ConvertFromLeet``) splits each word into every plausible run of
    substitution strings, maps the runs back to letters, and scores the
    resulting candidate words with the slang dictionary, the English
    dictionary, word frequencies and the spell checker.
    """

    def __init__(self, processes=1):
        """Load the dictionaries and build the substitution tables.

        processes - number of worker processes ConvertFromLeet() may start;
                    values below 1 fall back to 1
        """
        #number of worker processes
        self.processes = processes if processes > 0 else 1

        #load word frequency and spell checker
        self.spelling = Spelling()

        #load the dictionaries
        self.jargon = Dictionary("slang")
        self.dictionary = self.spelling.dictionary
        self.stopwords = self.spelling.stopwords

        #substitution tables: every string in a list may stand for that letter
        self.a = [
            "a", "4", "@", "/-\\", "/\\", "/_\\", "^", "aye", "ci", "λ", "∂",
            "//-\\\\", "/=\\", "ae"
        ]
        self.b = [
            "b", "8", "|3", "6", "13", "l3", "]3", "|o", "1o", "lo", "ß",
            "]]3", "|8", "l8", "18", "]8"
        ]
        self.c = ["c", "(", "<", "[", "{", "sea", "see", "k", "©", "¢", "€"]
        self.d = [
            "d", "|]", "l]", "1]", "|)", "l)", "1)", "[)", "|}", "l]", "1}",
            "])", "i>", "|>", "l>", "1>", "0", "cl", "o|", "o1", "ol", "Ð",
            "∂", "ð"
        ]
        self.e = ["e", "3", "&", "[-", "€", "ii", "ə", "£", "iii"]
        self.f = ["f", "|=", "]=", "}", "ph", "(=", "[=", "ʃ", "eph", "ph"]
        self.g = [
            "g", "6", "9", "&", "(_+", "C-", "gee", "jee", "(Y,", "cj", "[",
            "-", "(γ,", "(_-"
        ]
        self.h = [
            "h", "|-|", "#", "[-]", "{-}", "]-[", ")-(", "(-)", ":-:", "}{",
            "}-{", "aych", "╫", "]]-[[", "aech"
        ]
        self.i = ["!", "1", "|", "l", "eye", "3y3", "ai", "i"]
        self.j = [
            "j", "_|", "_/", "]", "</", "_)", "_l", "_1", "¿", "ʝ", "ul",
            "u1", "u|", "jay", "(/", "_]"
        ]
        self.k = [
            "k", "x", "|<", "|x", "|{", "/<", "\\<", "/x", "\\x", "ɮ", "kay"
        ]
        self.l = ["l", "1", "7", "|_", "1_", "l_", "lJ", "£", "¬", "el"]
        #"\\/" replaces the invalid escape "\/"; the string values are unchanged
        self.m = [
            "m", "/\\/\\", "|\\/|", "em", "|v|", "[v]", "^^", "nn",
            "//\\\\//\\\\", "(V)", "(\\/)", "/|\\", "/|/|", ".\\\\", "/^^\\",
            "/V\\", "|^^|", "JVL", "][\\\\//][", "[]\\/[]", "[]v[]", "(t)"
        ]
        self.n = [
            "n", "|\\|", "/\\/", "//\\\\//", "[\\]", "<\\>", "{\\}", "//",
            "[]\\[]", "]\\[", "~", "₪", "/|/", "in"
        ]
        #the ω is because Ω is mistakenly taken as that character sometimes...
        self.o = [
            "o", "0", "()", "oh", "[]", "{}", "¤", "Ω", "ω", "*", "[[]]", "oh"
        ]
        self.p = [
            "p", "|*", "l*", "1*", "|o", "lo", "1o", "|>", "l>", "1>", "|\"",
            "l\"", "1\"", "?", "9", "[]d", "|7", "l7", "17", "q", "|d", "ld",
            "1d", "℗", "|º", "1º", "lº", "þ", "¶", "pee"
        ]
        self.q = [
            "q", "0_", "o_", "0,", "o,", "(,)", "[,]", "<|", "<l", "<1",
            "cue", "9", "¶", "kew"
        ]
        self.r = [
            "r", "|2", "l2", "12", "2", "/2", "I2", "|^", "l^", "1^", "|~",
            "l~", "1~", "lz", "[z", "|`", "l`", "1`", ".-", "®", "Я", "ʁ",
            "|?", "l?", "1?", "arr"
        ]
        self.s = ["s", "5", "$", "z", "es", "2", "§", "š", ",,\\``"]
        self.t = ["t", "7", "+", "-|-", "-l-", "-1-", "1", "']['", "†"]
        self.u = [
            "u", "|_|", "l_l", "1_1", "(_)", "[_]", "{_}", "y3w", "m",
            "\\_/", "\\_\\", "/_/", "µ", "yew", "yoo", "yuu"
        ]
        self.v = ["v", "\\/", "\\\\//", "√"]
        self.w = [
            "w", "\\/\\/", "vv", "'//", "\\\\'", "\\^/", "(n)", "\\x/",
            "\\|/", "\\_|_/", "\\_l_/", "\\_1_/", "\\//\\//", "\\_:_/", "]i[",
            "uu", "Ш", "ɰ", "1/\\/", "\\/1/", "1/1/"
        ]
        self.x = [
            "x", "%", "><", "><,", "}{", "ecks", "x", "*", ")(", "ex", "Ж",
            "×"
        ]
        self.y = [
            "y", "j", "`/", "`(", "-/", "'/", "\\-/", "Ψ", "φ", "λ", "Ч", "¥",
            "``//", "\\j", "wai"
        ]
        self.z = ["z", "2", "~/_", "%", "7_", "ʒ", "≥", "`/_"]

        #digit tables; they are defined but not part of self.alphabet
        self.zero = ["0", "o", "zero", "cero", "()"]
        self.one = ["1", "won", "one", "l", "|", "]["]
        self.two = ["two", "to", "too", "2", "z"]
        self.three = ["e", "3", "three"]
        self.four = ["4", "four", "for", "fore", "a"]
        self.five = ["5", "five", "s"]
        self.six = ["6", "six", "g"]
        self.seven = ["7", "seven", "t", "l"]
        self.eight = ["8", "eight", "b"]
        self.nine = ["9", "nine", "g"]

        #letter -> substitution table, used for both encoding and decoding
        self.alphabet = {
            "a": self.a, "b": self.b, "c": self.c, "d": self.d, "e": self.e,
            "f": self.f, "g": self.g, "h": self.h, "i": self.i, "j": self.j,
            "k": self.k, "l": self.l, "m": self.m, "n": self.n, "o": self.o,
            "p": self.p, "q": self.q, "r": self.r, "s": self.s, "t": self.t,
            "u": self.u, "v": self.v, "w": self.w, "x": self.x, "y": self.y,
            "z": self.z
        }

    def ConvertToLeet(self, text):
        """
        Replace every letter that has a substitution table with a randomly
        chosen entry from that table.

        Characters without a table (digits, punctuation, whitespace, and
        letters outside a-z such as "é") are copied through unchanged.
        """
        leet = []
        for letter in text:
            #.get() instead of indexing: isalpha() is also true for letters
            #that have no table, which used to raise KeyError
            values = self.alphabet.get(letter.lower()) if letter.isalpha() else None
            leet.append(random.choice(values) if values else letter)
        return "".join(leet)

    def rec_parse(self, text, previous=None):
        """
        List every way of cutting text into two or more consecutive pieces
        where every piece except the last is at most 7 characters long.

        Input:  "abc"
        Output: [['a', 'bc'], ['a', 'b', 'c'], ['ab', 'c']]

        previous - pieces already cut off to the left of text; they are
                   prepended to every result (used by the recursion)

        The number of results grows exponentially with len(text).
        """
        prefix = [] if previous is None else list(previous)
        possibilities = []

        #a cut after position 1..7, never at the very end of the text
        for cut in range(1, min(len(text), 8)):
            head, tail = text[:cut], text[cut:]
            possibilities.append(prefix + [head, tail])
            possibilities += self.rec_parse(tail, prefix + [head])

        return possibilities

    def rec_scan_array(self, array, previous=None):
        """
        Build every word that takes one letter from each position of array.

        Input:  [['h'], ['e'], ['i', 'l', 't'], ['i', 'l', 't'], ['d', 'o']]
        Output: ['heiid', 'heiio', 'heild', 'heilo', ...]

        previous - letters to put in front of every generated word

        Returns [] for an empty array or when any position has no letters.
        """
        from itertools import product

        #product() of nothing yields one empty combination, which would turn
        #an empty array into a bogus one-word result
        if len(array) == 0:
            return []

        prefix = "".join(previous) if previous else ""
        #product() varies the last position fastest, which keeps the order
        #the recursive implementation produced
        return [prefix + "".join(combo) for combo in product(*array)]

    def ConvertFromLeet(self, text):
        """
        Convert leet to readable English text. Find all possible words,
        check which are English, check for misspellings, etc.

        Uses self.processes, so when creating the LeetSpeak() object, you
        can specify the number of worker processes to use:
        l=LeetSpeak(processes=3)

        Raises EOFError if a worker process exits without sending a result.
        """
        chunks = self._chunk_words(text.split(" "))

        #one pipe per worker: writing to a shared pipe end from several
        #processes at the same time can corrupt the data
        jobs = []
        for index, chunk in enumerate(chunks):
            receive_end, send_end = Pipe(duplex=False)
            worker = Process(target=self.ConvertFromLeet_thread,
                             args=(chunk, index, send_end))
            worker.start()
            #the worker holds its own handle; dropping ours lets recv() see
            #EOF instead of blocking forever if the worker dies
            send_end.close()
            jobs.append((worker, receive_end))

        pieces = []
        try:
            #receive before joining so a worker blocked in send() on a full
            #pipe can never deadlock against join()
            for worker, receive_end in jobs:
                pieces.append(receive_end.recv()[1])
        finally:
            for worker, receive_end in jobs:
                receive_end.close()
                worker.join()

        #jobs are in chunk order, so the decoded pieces already are too
        return " ".join(pieces).strip()

    def _chunk_words(self, words):
        """Split words into contiguous, near-equal chunks, one per worker.

        With self.processes = 3 and "cows are cool or not" the result is
        [['cows', 'are'], ['cool', 'or'], ['not']].
        """
        workers = min(self.processes, len(words))
        base, extra = divmod(len(words), workers)

        chunks = []
        start = 0
        for index in range(workers):
            #the first `extra` workers take one word more than the rest
            size = base + (1 if index < extra else 0)
            chunks.append(words[start:start + size])
            start += size
        return chunks

    def ConvertFromLeet_thread(self, text, thread_id, pipe):
        """
        The function that ConvertFromLeet() calls for each worker.

        text      - list of leet words to decode
        thread_id - identifier sent back with the result
        pipe      - connection that receives [thread_id, "decoded words"]

        A word for which no letter combination can be found is left out of
        the result.
        """
        english = []

        for word in text:
            candidates = self._candidate_words(word)
            if not candidates:
                continue

            scores = self._score_candidates(word, candidates)
            self._apply_corrections(word, scores)

            #highest score wins; on a tie the entry added first is kept
            best = max(scores.items(), key=operator.itemgetter(1))
            english.append(best[0])

        #send the result
        pipe.send([thread_id, " ".join(english)])

    def _candidate_words(self, word):
        """Return every letter sequence the leet word could stand for."""
        lowered = word.lower()

        #every split of the word into pieces (e.g. "c,ow", "co,w", "c,o,w")
        segmentations = self.rec_parse(lowered)

        #the whole word may itself be one substitution (e.g. "n" for "and")
        if len(word) <= 8:
            segmentations.append([lowered])

        candidates = []
        for pieces in segmentations:
            options = []
            for piece in pieces:
                letters = [
                    letter for letter, glyphs in self.alphabet.items()
                    if piece in glyphs
                ]
                if not letters:
                    #one unreadable piece rules out the whole split
                    break
                options.append(letters)
            else:
                if options:
                    candidates += self.rec_scan_array(options)
        return candidates

    def _score_candidates(self, word, candidates):
        """Score each candidate; return a {word: score} dict.

        NOTE(review): the nesting of the branches below was reconstructed
        from source whose indentation was lost -- confirm that `score = 0`
        belongs inside the non-empty-translation branch and that the
        frequency bonus is not limited to dictionary words.
        """
        scores = {}

        for candidate in candidates:
            #shorter candidates start with a higher score
            score = 1 + 5 / len(candidate)

            #computer talk
            if self.jargon.Contains(candidate):
                translation = self.jargon.Translate(candidate)
                bonus = 4 if self.dictionary.Contains(translation) else 2
                score += bonus

                if len(translation) > 0:
                    if translation in scores:
                        scores[translation] += bonus
                    else:
                        scores[translation] = score
                    #the translation took the score, the slang starts over
                    score = 0

            #valid english
            if len(candidate) > 1 and self.dictionary.Contains(candidate):
                score += 5

            #frequency words; stop words get less weight
            if self.stopwords.Contains(candidate):
                score += self.spelling.Frequency(candidate)
            else:
                score += 5 * self.spelling.Frequency(candidate)

            #same length
            if len(word) == len(candidate):
                score += 0.1

            #no numbers
            if candidate.isalpha():
                score += 1

            scores[candidate] = score

        return scores

    def _apply_corrections(self, word, scores):
        """Add spell-checker suggestions for doubtful candidates to scores."""
        #figure out what word is the most likely to be correctable
        to_check = []
        found_good = False

        for candidate in scores:
            if not candidate.isalpha():
                continue
            #if there is already a good word in the list, then don't bother
            #with looking up spell corrections
            if self.dictionary.Contains(candidate) and len(candidate) >= len(word) / 2:
                found_good = True
                to_check = []
                break
            to_check.append(candidate)

        if not to_check and not found_good:
            #fall back to the first candidate; scores is keyed by word, so
            #the previous scores[0] lookup could only raise KeyError
            to_check.append(next(iter(scores)))

        #append the corrected version, hopefully
        for item in to_check:
            corrected = self.spelling.Check(item, dictionary=True, fast=True)
            if not corrected:
                continue

            suggestion = corrected[0]
            if suggestion not in scores:
                frequency = self.spelling.Frequency(suggestion)

                #if it is on the stop list, don't add as much weight
                if self.stopwords.Contains(suggestion):
                    value = frequency + 1
                else:
                    value = 5 * frequency + 1

                #add weight if in the dictionary
                if self.dictionary.Contains(suggestion):
                    value += 1

                #add weight if not numbers
                if suggestion.isalpha():
                    value += 1

                scores[suggestion] = value
            else:
                #if one of the corrected words is already a candidate then
                #up that value by 0.1
                for alternative in corrected:
                    if alternative in scores:
                        scores[alternative] += 0.1
class Spelling:
    """Spell checker built on a word-frequency list and a metaphone index.

    self.gutenburg maps word -> occurrence count (kept as the string read
    from the file); self.learned maps metaphone code -> {word: count}.
    """

    def __init__(self):
        """Load the dictionaries, the frequency list and the phonetic index."""
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'
        self.guten = "data/gutenburg_small.txt"
        self.guten_pickle = "data/gutenburg_small.pickle"
        self.gutenburg = {}
        self.learned = {}
        self.dictionary = Dictionary("usa")
        self.stopwords = Dictionary("stopwords")

        self.Load_gutenburg()
        self.Load_learned()

    def Load_gutenburg(self):
        """
        Read the frequency file into self.gutenburg.

        Only lines made of exactly two space-separated fields, count then
        word, are used; every other line is skipped.
        """
        with open(self.guten, encoding='utf-8') as dictionary_file:
            for line in dictionary_file:
                fields = line.strip().split(" ")
                if len(fields) == 2:
                    count, word = fields
                    self.gutenburg[word] = count

    def Load_learned(self):
        """
        Load the metaphone array, and if it doesn't exist, create it.

        Does nothing when self.learned is already populated.  A freshly
        built index is written to self.guten_pickle for the next run.
        """
        if self.learned:
            return

        if os.path.exists(self.guten_pickle):
            #NOTE: pickle.load runs arbitrary code from the file; the cache
            #file must only ever come from a trusted location
            with open(self.guten_pickle, 'rb') as infile:
                self.learned = pickle.load(infile)
            return

        if not self.gutenburg:
            #this used to call the non-existent self.Load_dict()
            self.Load_gutenburg()

        for word, times in self.gutenburg.items():
            meta = metaphone(word.replace("'", ""))
            #group words that share a metaphone code, keeping their counts
            self.learned.setdefault(meta, {}).setdefault(word, int(times))

        if self.learned:
            with open(self.guten_pickle, 'wb') as outfile:
                pickle.dump(self.learned, outfile)

    def slight_edits(self, word):
        """
        Find all edits within one character of the word: one deletion,
        one transposition of neighbours, one replacement or one insertion.
        """
        splits = [(word[:i], word[i:]) for i in range(len(word) + 1)]
        deletes = [a + b[1:] for a, b in splits if b]
        transposes = [a + b[1] + b[0] + b[2:] for a, b in splits if len(b) > 1]
        replaces = [a + c + b[1:] for a, b in splits for c in self.alphabet if b]
        inserts = [a + c + b for a, b in splits for c in self.alphabet]
        return set(deletes + transposes + replaces + inserts)

    def letters_off(self, word):
        """
        Find all edits within two characters of the word.
        """
        return {
            second
            for first in self.slight_edits(word)
            for second in self.slight_edits(first)
        }

    def known_guten(self, words):
        """
        Return the words that occur more than 100 times in the gutenburg
        frequency list.
        """
        return {
            w for w in words
            if w in self.gutenburg and int(self.gutenburg[w]) > 100
        }

    def known_usa(self, words):
        """
        Return the words that are in the USA dictionary.
        """
        return {w for w in words if self.dictionary.Contains(w)}

    def highest_likely(self, words):
        """
        Order the given words by their frequency in self.gutenburg, most
        frequent first.  Words missing from self.gutenburg are dropped.

        Returns the ordered list, or False when none of the words is known.
        Used in max_look_like and Check.
        """
        #a dict also removes duplicates while keeping first-seen order
        counts = {w: int(self.gutenburg[w]) for w in words if w in self.gutenburg}
        likely = sorted(counts, key=counts.get, reverse=True)
        return likely if likely else False

    def max_look_like(self, word, fast=False):
        """
        Find all the words that are likely to be mistakes from when the user
        knows how to spell it but accidently types it in wrong.

        Tries, in order: the word itself, words one edit away, words two
        edits away (all against the USA dictionary) and, unless fast is
        set, the same edits against the gutenburg list.  The first
        non-empty step wins; if every step is empty the word itself is
        used.  Returns whatever highest_likely() makes of that.
        """
        words = self.known_usa([word])
        if not words:
            near = self.slight_edits(word)
            words = self.known_usa(near)
            if not words:
                #by far the most expensive set; compute it at most once
                far = self.letters_off(word)
                words = self.known_usa(far)
                if not words and not fast:
                    words = self.known_guten(near) or self.known_guten(far)

        return self.highest_likely(words or [word])

    def max_sound_like(self, word):
        """
        Find all the words that are likely to be mistakes from the user not
        knowing how to spell it but knowing how it sounds.

        Returns the words sharing the word's metaphone code, most frequent
        first, or False when there are none or when the word itself is one
        of them.
        """
        meta = metaphone(word)
        if meta not in self.learned:
            return False

        ranked = sorted(self.learned[meta].items(),
                        key=operator.itemgetter(1), reverse=True)
        names = [name for name, _ in ranked]

        #a word that is itself in the list needs no correction
        if word in names:
            return False
        return names if names else False

    def Check(self, word, dictionary=False, fast=False):
        """
        Figure out what the user probably is looking for.

        word       - the word to check if it is spelled correctly, and if it
                     isn't, return the correct word
        dictionary - if True, return the list of possible corrections
                     instead of only the best one
        fast       - passed on to max_look_like()

        Returns False when there is no suggestion.
        """
        if len(word) == 0:
            return False

        look = self.max_look_like(word.lower(), fast=fast)
        sound = self.max_sound_like(word)

        if not look and not sound:
            return False

        if not look:
            candidates = sound
        elif not sound:
            candidates = look
        else:
            #prefer words both methods agree on, otherwise rank the
            #look-alikes together with the best few sound-alikes
            shared = set(look) & set(sound)
            if shared:
                candidates = self.highest_likely(shared)
            else:
                extras = [
                    ent for ent in sound[:10]
                    if self.dictionary.Contains(ent)
                    and not self.stopwords.Contains(ent)
                ]
                candidates = self.highest_likely(look + extras)

        #highest_likely() returns False when nothing is known; len() of
        #that result used to raise TypeError
        if not candidates:
            return False
        if dictionary:
            return candidates
        return candidates[0] or False

    def Frequency(self, word):
        """
        Return the word's count divided by the number of entries in
        self.gutenburg, or 0 when the word is not in the list.
        """
        length = len(self.gutenburg)
        if word in self.gutenburg and length > 0:
            return int(self.gutenburg[word]) / length
        return 0