# Example #1
# 0
class LeetSpeak:
    def __init__(self, processes=1):
        """
        Build the leet translation tables and load the language resources.

        processes - number of worker processes ConvertFromLeet() may start;
                    any value that is not greater than 0 falls back to 1.
        """
        #number of worker processes (never less than one)
        if processes > 0:
            self.processes = processes
        else:
            self.processes = 1
        #load word frequency and spell checker
        self.spelling = Spelling()
        #load the dictionaries
        self.jargon = Dictionary("slang")
        self.dictionary = self.spelling.dictionary
        self.stopwords = self.spelling.stopwords

        #One list per letter: every glyph that may stand for that letter.
        #ConvertToLeet() picks randomly from a list, ConvertFromLeet_thread()
        #searches the lists to map glyphs back to letters.
        #NOTE(review): ConvertFromLeet_thread() lower-cases its input first, so
        #glyphs containing upper-case characters ("C-", "(V)", "JVL", ...) can
        #only ever be produced, never decoded.
        self.a = [
            "a", "4", "@", "/-\\", "/\\", "/_\\", "^", "aye", "ci", "λ", "∂",
            "//-\\\\", "/=\\", "ae"
        ]
        self.b = [
            "b", "8", "|3", "6", "13", "l3", "]3", "|o", "1o", "lo", "ß",
            "]]3", "|8", "l8", "18", "]8"
        ]
        self.c = ["c", "(", "<", "[", "{", "sea", "see", "k", "©", "¢", "€"]
        #"l}" used to be a second "l]"; the |x, lx, 1x pattern of the
        #neighbouring entries shows the closing brace was intended
        self.d = [
            "d", "|]", "l]", "1]", "|)", "l)", "1)", "[)", "|}", "l}", "1}",
            "])", "i>", "|>", "l>", "1>", "0", "cl", "o|", "o1", "ol", "Ð",
            "∂", "ð"
        ]
        self.e = ["e", "3", "&", "[-", "€", "ii", "ə", "£", "iii"]
        self.f = ["f", "|=", "]=", "}", "ph", "(=", "[=", "ʃ", "eph", "ph"]
        self.g = [
            "g", "6", "9", "&", "(_+", "C-", "gee", "jee", "(Y,", "cj", "[",
            "-", "(γ,", "(_-"
        ]
        self.h = [
            "h", "|-|", "#", "[-]", "{-}", "]-[", ")-(", "(-)", ":-:", "}{",
            "}-{", "aych", "╫", "]]-[[", "aech"
        ]
        self.i = ["!", "1", "|", "l", "eye", "3y3", "ai", "i"]
        self.j = [
            "j", "_|", "_/", "]", "</", "_)", "_l", "_1", "¿", "ʝ", "ul", "u1",
            "u|", "jay", "(/", "_]"
        ]
        self.k = [
            "k", "x", "|<", "|x", "|{", "/<", "\\<", "/x", "\\x", "ɮ", "kay"
        ]
        self.l = ["l", "1", "7", "|_", "1_", "l_", "lJ", "£", "¬", "el"]
        #every backslash is written as "\\": the former "\/" sequences were
        #invalid escapes (same string value, but a SyntaxWarning on newer
        #Python versions)
        self.m = [
            "m", "/\\/\\", "|\\/|", "em", "|v|", "[v]", "^^", "nn",
            "//\\\\//\\\\", "(V)", "(\\/)", "/|\\", "/|/|", ".\\\\", "/^^\\",
            "/V\\", "|^^|", "JVL", "][\\\\//][", "[]\\/[]", "[]v[]", "(t)"
        ]
        self.n = [
            "n", "|\\|", "/\\/", "//\\\\//", "[\\]", "<\\>", "{\\}", "//",
            "[]\\[]", "]\\[", "~", "₪", "/|/", "in"
        ]
        #the ω is because Ω is mistakenly taken as that character sometimes...
        self.o = [
            "o", "0", "()", "oh", "[]", "{}", "¤", "Ω", "ω", "*", "[[]]", "oh"
        ]
        self.p = [
            "p", "|*", "l*", "1*", "|o", "lo", "1o", "|>", "l>", "1>", "|\"",
            "l\"", "1\"", "?", "9", "[]d", "|7", "l7", "17", "q", "|d", "ld",
            "1d", "℗", "|º", "1º", "lº", "þ", "¶", "pee"
        ]
        self.q = [
            "q", "0_", "o_", "0,", "o,", "(,)", "[,]", "<|", "<l", "<1", "cue",
            "9", "¶", "kew"
        ]
        self.r = [
            "r", "|2", "l2", "12", "2", "/2", "I2", "|^", "l^", "1^", "|~",
            "l~", "1~", "lz", "[z", "|`", "l`", "1`", ".-", "®", "Я", "ʁ",
            "|?", "l?", "1?", "arr"
        ]
        self.s = ["s", "5", "$", "z", "es", "2", "§", "š", ",,\\``"]
        self.t = ["t", "7", "+", "-|-", "-l-", "-1-", "1", "']['", "†"]
        self.u = [
            "u", "|_|", "l_l", "1_1", "(_)", "[_]", "{_}", "y3w", "m", "\\_/",
            "\\_\\", "/_/", "µ", "yew", "yoo", "yuu"
        ]
        self.v = ["v", "\\/", "\\\\//", "√"]
        self.w = [
            "w", "\\/\\/", "vv", "'//", "\\\\'", "\\^/", "(n)", "\\x/", "\\|/",
            "\\_|_/", "\\_l_/", "\\_1_/", "\\//\\//", "\\_:_/", "]i[", "uu",
            "Ш", "ɰ", "1/\\/", "\\/1/", "1/1/"
        ]
        self.x = [
            "x", "%", "><", "><,", "}{", "ecks", "x", "*", ")(", "ex", "Ж", "×"
        ]
        self.y = [
            "y", "j", "`/", "`(", "-/", "'/", "\\-/", "Ψ", "φ", "λ", "Ч", "¥",
            "``//", "\\j", "wai"
        ]
        self.z = ["z", "2", "~/_", "%", "7_", "ʒ", "≥", "`/_"]
        #digit tables; they are defined here but not part of self.alphabet
        self.zero = ["0", "o", "zero", "cero", "()"]
        self.one = ["1", "won", "one", "l", "|", "]["]
        self.two = ["two", "to", "too", "2", "z"]
        self.three = ["e", "3", "three"]
        self.four = ["4", "four", "for", "fore", "a"]
        self.five = ["5", "five", "s"]
        self.six = ["6", "six", "g"]
        self.seven = ["7", "seven", "t", "l"]
        self.eight = ["8", "eight", "b"]
        self.nine = ["9", "nine", "g"]

        #letter -> list of glyphs; the digit entries below are disabled
        #"0":self.zero,"1":self.one,"2":self.two,"3":self.three,"4":self.four,"5":self.five,"6":self.six,"7":self.seven,"8":self.eight,"9":self.nine
        self.alphabet = {
            "a": self.a,
            "b": self.b,
            "c": self.c,
            "d": self.d,
            "e": self.e,
            "f": self.f,
            "g": self.g,
            "h": self.h,
            "i": self.i,
            "j": self.j,
            "k": self.k,
            "l": self.l,
            "m": self.m,
            "n": self.n,
            "o": self.o,
            "p": self.p,
            "q": self.q,
            "r": self.r,
            "s": self.s,
            "t": self.t,
            "u": self.u,
            "v": self.v,
            "w": self.w,
            "x": self.x,
            "y": self.y,
            "z": self.z
        }

    def ConvertToLeet(self, text):
        """
        Convert plain text to leet speak.

        Every letter that has an entry in self.alphabet is replaced by a
        randomly chosen glyph from that entry. Everything else (digits,
        punctuation, whitespace, letters without an entry such as "é") is
        copied unchanged.

        text - the string to convert
        Returns the converted string.
        """
        pieces = []

        for letter in text:
            #.get() rather than indexing: isalpha() is also true for letters
            #outside a-z, which have no table and used to raise KeyError
            glyphs = self.alphabet.get(letter.lower()) if letter.isalpha() else None

            if glyphs:
                #the global generator is deliberately not re-seeded here, so a
                #seed set by the caller stays in effect
                pieces.append(random.choice(glyphs))
            else:
                pieces.append(letter)

        return "".join(pieces)

    def rec_parse(self, text, previous=None):
        """
        List every way of cutting text into two or more consecutive pieces.

        Every piece except the last one is 1 to 7 characters long; the last
        piece is whatever remains after the final cut.

        Input: "cow"
        Output:
         ['c','ow'],
         ['c','o','w'],
         ['co','w']

        text - the string to cut
        previous - pieces already cut off in front of text (used by the
                   recursion); they are prepended to every result
        Returns a list of lists of strings; empty when text has fewer than
        two characters.
        """
        prefix = [] if previous is None else previous
        possibilities = []

        #a cut position is at most 7 and always leaves a non-empty tail
        for cut in range(1, min(len(text), 8)):
            head, tail = text[:cut], text[cut:]
            possibilities.append(prefix + [head, tail])
            possibilities += self.rec_parse(tail, prefix + [head])

        return possibilities

    def rec_scan_array(self, array, previous=None):
        """
        Build every word that takes one letter from each position of array.

        Input: [['h'], ['e'], ['i', 'l'], ['d', 'o']]
        Output: ['heid', 'heio', 'held', 'helo']

        array - list of lists; array[n] holds the candidate letters for
                position n
        previous - letters that are put in front of every generated word
        Returns a list of strings, ordered with the last position varying
        fastest. The list is empty when array is empty or when any position
        has no candidates.
        """
        #function-scope import so the block does not rely on the module header
        import itertools

        #an empty array used to raise IndexError
        if not array:
            return []

        prefix = "".join(previous) if previous else ""

        return [prefix + "".join(combo) for combo in itertools.product(*array)]

    def ConvertFromLeet(self, text):
        """
        Convert leet to readable English text. Find all possible words, check which are English, check for misspellings, etc.

        The words of text (split on single spaces) are divided into at most
        self.processes contiguous chunks and every chunk is decoded by
        ConvertFromLeet_thread() in its own process. The number of processes
        is chosen when creating the object: l=LeetSpeak(processes=3)

        text - the leet string to decode
        Returns the decoded chunks joined by single spaces, in the original
        chunk order, with surrounding whitespace stripped.
        Raises EOFError when a worker exits without sending its result.
        """
        words = text.split(" ")

        #Contiguous chunks whose sizes differ by at most one, larger first:
        #processes=3 and "cows are cool or not" gives
        #[['cows', 'are'], ['cool', 'or'], ['not']]
        worker_count = max(1, min(self.processes, len(words)))
        base, extra = divmod(len(words), worker_count)
        chunks = []
        start = 0
        for index in range(worker_count):
            size = base + (1 if index < extra else 0)
            chunks.append(words[start:start + size])
            start += size

        #One one-way pipe per worker: several processes writing to the same
        #pipe end at the same time may corrupt the data.
        workers = []
        for index, chunk in enumerate(chunks):
            receiver, sender = Pipe(duplex=False)
            worker = Process(target=self.ConvertFromLeet_thread,
                             args=(chunk, index, sender))
            worker.start()
            #the worker holds its own copy; dropping ours lets recv() see
            #end-of-file if the worker dies before sending
            sender.close()
            workers.append((worker, receiver))

        #Read BEFORE joining: a worker blocks in send() until its data is
        #consumed once the payload exceeds the pipe buffer, so joining first
        #could wait forever.
        thread_results = {}
        try:
            for worker, receiver in workers:
                thread_id, decoded = receiver.recv()
                thread_results[thread_id] = decoded
        finally:
            for worker, receiver in workers:
                receiver.close()
                worker.join()

        #put the chunks back into their original order
        ordered = (thread_results[key] for key in sorted(thread_results))

        return " ".join(ordered).strip()

    def ConvertFromLeet_thread(self, text, thread_id, pipe):
        """
        The function that ConvertFromLeet() calls for each thread.
        """
        english = []

        #convert each word
        for word in text:
            #get all the character locations less than 8 (e.g. "c,ow", "co,w", and "cow" for "cow")
            #this uses some recursive substringing
            possibilities = self.rec_parse(word.lower())

            #append the actual "word" if it is less than 8 characters, since it might be a single letter (e.g. "n" for "and")
            if len(word) <= 8:
                possibilities.append([word.lower()])

            #calculate what this could be in leet (if it can be anything)
            validwords = []
            for possibility in possibilities:
                letters = []
                valid = 1
                for char in possibility:
                    chars = []
                    for let, val in self.alphabet.items():
                        if char in val:
                            chars.append(let)
                    if len(chars) == 0:
                        valid = 0
                        break
                    else:
                        letters.append(chars)

                    del chars
                if valid == 1 and len(letters) > 0:
                    #generate possible words from given letters
                    words = self.rec_scan_array(letters)
                    validwords += words
                    del words

            #print(validwords)

            #check which valid words are english if there's more than one option
            #go with the most frequently used english word
            if len(validwords) > 0:
                englishwords = {}

                for valid in validwords:
                    score = 1 + 5 / len(valid)

                    #computer talk
                    if self.jargon.Contains(valid) == True:
                        value = 2
                        jargon = self.jargon.Translate(valid)

                        if self.dictionary.Contains(jargon) == True:
                            value = 4

                        score += value

                        if len(jargon) > 0:
                            if jargon in englishwords:
                                englishwords[jargon] += value
                            else:
                                englishwords[jargon] = score

                            score = 0
                    #valid english
                    if len(valid) > 1 and self.dictionary.Contains(
                            valid) == True:
                        score += 5
                    #frequency words
                    if self.stopwords.Contains(valid):
                        score += self.spelling.Frequency(valid)
                    else:
                        score += 5 * self.spelling.Frequency(valid)
                    #same length
                    if len(word) == len(valid):
                        score += 0.1
                    #no numbers
                    if valid.isalpha() == True:
                        score += 1

                    englishwords[valid] = score

                #figure out what word is the most likely to be correctable
                check = []
                skip = 0
                for valid in englishwords:
                    if valid.isalpha():
                        #if there is already a good word in the list, then don't bother with looking up spell corrections
                        if self.dictionary.Contains(
                                valid) and len(valid) >= len(word) / 2:
                            skip = 1
                            check = []
                            break
                        else:
                            check.append(valid)
                if len(check) == 0 and skip == 0:
                    check.append(englishwords[0])
                #append the corrected version, hopefully
                for item in check:
                    corrected = self.spelling.Check(item,
                                                    dictionary=True,
                                                    fast=True)
                    if corrected != False and len(corrected) > 0:
                        word = corrected[0]

                        if word not in englishwords:
                            frequency = self.spelling.Frequency(word)
                            #if it is on the stop list, don't add as much weight
                            if self.stopwords.Contains(word):
                                value = frequency + 1
                            else:
                                value = 5 * frequency + 1
                            #add weight if in the dictionary
                            if self.dictionary.Contains(word) == True:
                                value += 1
                            #add weight if not numbers
                            if word.isalpha() == True:
                                value += 1
                            englishwords[word] = value
                        else:
                            #if one of the corrected words list is in the englishwords list then up that value by 0.1
                            for correct in corrected:
                                if correct in englishwords:
                                    englishwords[correct] += 0.1

                #get the most likely word
                final = sorted(englishwords.items(),
                               key=operator.itemgetter(1),
                               reverse=True)[0]
                #add word
                english.append(final[0])

        #send the result
        pipe.send([thread_id, " ".join(english)])
class Spelling:
    def __init__(self):
        """
        Set up the spell checker: file locations, dictionaries, the word
        frequency table and the sound-alike table.
        """
        #frequency corpus and the pickle cache that Load_learned() reads/writes
        self.guten = "data/gutenburg_small.txt"
        self.guten_pickle = "data/gutenburg_small.pickle"

        #letters tried by slight_edits() when generating candidate spellings
        self.alphabet = 'abcdefghijklmnopqrstuvwxyz'

        #word -> count, filled by Load_gutenburg()
        self.gutenburg = {}
        #metaphone key -> {word: count}, filled by Load_learned()
        self.learned = {}

        self.dictionary = Dictionary("usa")
        self.stopwords = Dictionary("stopwords")

        #the frequency table must exist before the sound-alike table is built
        self.Load_gutenburg()
        self.Load_learned()

    def Load_gutenburg(self):
        with open(self.guten,encoding='utf-8') as dictionary_file:
            for line in dictionary_file:
                words=line.strip().split(" ")
                length=len(words)

                if length == 2:
                    self.gutenburg[words[1]]=words[0]

    #def Load_dictionary(self):
    #    with open('words/american-english', encoding='utf-8') as dictionary_file:
    #        for line in dictionary_file:
    #            word=line.strip()
    #            length=len(word)
    #
    #            if length > 1:
    #                self.dictionary.append(word)

    def Load_learned(self):
        """
        Load the metaphone array, and if it doesn't exist, create it.
        """
        if len(self.learned) == 0:
            if os.path.exists(self.guten_pickle):
                with open(self.guten_pickle, 'rb') as infile:
                    self.learned=pickle.load(infile)
            else:
                if len(self.gutenburg) == 0:
                    self.Load_dict()

                for word,times in self.gutenburg.items():
                    meta=metaphone(word.replace("'",""))

                    #add and up the frequency of the words
                    if meta not in self.learned:
                        self.learned[meta]={word:int(times)}
                    elif word not in self.learned[meta]:
                        self.learned[meta][word]=int(times)

                if len(self.learned) > 0:
                    with open(self.guten_pickle, 'wb') as outfile:
                            pickle.dump(self.learned, outfile)

    def slight_edits(self,word):
        """
        Return the set of strings that are one edit away from word.

        An edit is: deleting one character, swapping two neighbouring
        characters, replacing one character with a letter of self.alphabet,
        or inserting a letter of self.alphabet at any position.
        """
        edits = set()

        for cut in range(len(word) + 1):
            head, tail = word[:cut], word[cut:]

            #insertion between head and tail
            for letter in self.alphabet:
                edits.add(head + letter + tail)

            #the remaining edits all consume the first character of tail
            if not tail:
                continue

            #deletion
            edits.add(head + tail[1:])
            #transposition
            if len(tail) > 1:
                edits.add(head + tail[1] + tail[0] + tail[2:])
            #replacement
            for letter in self.alphabet:
                edits.add(head + letter + tail[1:])

        return edits

    def letters_off(self,word):
        """
        Return the set of strings reached by applying slight_edits() twice,
        i.e. every single edit of every single edit of word.
        """
        twice_edited = set()

        for once_edited in self.slight_edits(word):
            twice_edited.update(self.slight_edits(once_edited))

        return twice_edited

    def known_guten(self,words):
        """
        Return the subset of words that are listed in self.gutenburg with a
        count greater than 100.
        """
        frequent = set()

        for candidate in words:
            if candidate not in self.gutenburg:
                continue
            if int(self.gutenburg[candidate]) > 100:
                frequent.add(candidate)

        return frequent

    def known_usa(self,words):
        """
        Determine if the word is in the USA dictionary file.
        """
        return set(w for w in words if self.dictionary.Contains(w) == True)

    def highest_likely(self,words):
        """
        Rank words by their count in self.gutenburg, most frequent first.

        Words that are not in self.gutenburg are left out. Used in
        max_look_like and Check.

        Returns the ranked list, or False when none of the words is known.
        """
        #count of every known word
        counts = {
            candidate: int(self.gutenburg.get(candidate))
            for candidate in words
            if candidate in self.gutenburg
        }

        #stable sort: equally frequent words keep their incoming order
        ranked = sorted(counts, key=counts.get, reverse=True)

        return ranked if ranked else False

    def max_look_like(self,word,fast=False):
        """
        Find all the words that are likely to be mistakes from when the user knows how to spell it but accidently types it in wrong.
        """
        if fast==True:
            words=self.known_usa([word]) or self.known_usa(self.slight_edits(word)) or self.known_usa(self.letters_off(word)) or [word]
        else:
            words=self.known_usa([word]) or self.known_usa(self.slight_edits(word)) or self.known_usa(self.letters_off(word)) or self.known_guten(self.slight_edits(word)) or self.known_guten(self.letters_off(word)) or [word]

        return self.highest_likely(words)

    def max_sound_like(self,word):
        """
        Find the words the user most likely meant when only the sound of the
        word is known, not its spelling.

        Looks up the metaphone key of word in self.learned. When word itself
        is among the words stored under that key there is nothing to suggest.

        Returns the stored words ordered by count, highest first, or False
        when there is no suggestion.
        """
        meta = metaphone(word)

        if meta not in self.learned:
            return False

        #(word, count) pairs, most frequent first
        ranked = sorted(self.learned[meta].items(),
                        key=operator.itemgetter(1),
                        reverse=1)
        sound_alikes = [entry[0] for entry in ranked]

        #the word is already a known spelling for this sound
        if word in sound_alikes:
            return False

        return sound_alikes if sound_alikes else False

    def Check(self,word,dictionary=False,fast=False):
        """
        Figure out what the user probably is looking for.

        word - the word to check
        dictionary - if True, return the list of possible corrections instead
                     of only the best one
        fast - passed on to max_look_like()

        Returns the best correction (or the list of corrections), or False
        when word is empty or nothing was found.
        """
        #nothing to look up
        if len(word) == 0:
            return False

        look = self.max_look_like(word.lower(), fast=fast)
        sound = self.max_sound_like(word)
        want_all = dictionary == True

        #neither strategy produced anything
        if look == False and sound == False:
            return False

        if look == False:
            #only sound-alikes are available
            result = sound if want_all else sound[0]
        elif sound == False:
            #only look-alikes are available
            result = look if want_all else look[0]
        else:
            #both have options: prefer words found by both strategies,
            #otherwise rank the look-alikes together with the first few
            #sound-alikes that are dictionary words but not stop words
            shared = set(look) & set(sound)

            if len(shared) > 0:
                likely = self.highest_likely(shared)
            else:
                extras = [
                    ent for key, ent in enumerate(sound)
                    if key < 10 and self.dictionary.Contains(ent) == True
                    and self.stopwords.Contains(ent) == False
                ]
                likely = self.highest_likely(look + extras)

            if want_all:
                result = likely
            elif len(likely) > 0:
                result = likely[0]
            else:
                result = ""

        if result == "" or result == {}:
            return False

        return result

    def Frequency(self,word):
        """
        Return the count stored for word in self.gutenburg divided by the
        number of entries in self.gutenburg, or 0 when the word is unknown.
        """
        entries = len(self.gutenburg)

        if entries > 0 and word in self.gutenburg:
            return int(self.gutenburg.get(word)) / entries

        return 0