Esempio n. 1
0
    def test_words_with_shared_prefix_should_retain_counts(self):
        """Check that "pipe" and "pips" keep their own counts in lookups.

        Every query is looked up with ``Verbosity.ALL`` and an edit distance
        of 1; the returned suggestions are compared, in order, with the
        expected (term, count) pairs.
        """
        print('  - %s' % inspect.stack()[0][3])
        checker = SymSpell(16, 1, 3)
        checker.create_dictionary_entry("pipe", 5)
        checker.create_dictionary_entry("pips", 10)

        # query -> suggestions expected, in result order
        expectations = (
            ("pipe", (("pipe", 5), ("pips", 10))),
            ("pips", (("pips", 10), ("pipe", 5))),
            ("pip", (("pips", 10), ("pipe", 5))),
        )
        for query, expected in expectations:
            found = checker.lookup(query, Verbosity.ALL, 1)
            self.assertEqual(len(expected), len(found))
            for position, (term, count) in enumerate(expected):
                self.assertEqual(term, found[position].term)
                self.assertEqual(count, found[position].count)
Esempio n. 2
0
class Autocorrect:
    """Small convenience wrapper around a ``SymSpell`` instance."""

    def __init__(self, words=None, max_edit_distance=2):
        """Create the wrapped ``SymSpell`` and optionally feed it *words*.

        Args:
            words: Passed to :meth:`add_words` when not ``None``.
            max_edit_distance: Edit distance used by :meth:`correct` and
                :meth:`predictions`.
        """
        self._symspell = SymSpell()
        self._max_edit_distance = max_edit_distance
        if words is not None:
            self.add_words(words)

    def add_word(self, word):
        """Register *word* with a count of 1; ``None`` is ignored."""
        if word is None:
            return
        self._symspell.create_dictionary_entry(word, 1)

    def add_words(self, words):
        """Hand *words* to ``SymSpell.create_dictionary``; ``None`` is ignored."""
        if words is None:
            return
        self._symspell.create_dictionary(words)

    def delete_word(self, word):
        """Remove *word* from the dictionary; ``None`` is ignored."""
        if word is None:
            return
        self._symspell.delete_dictionary_entry(word)

    def _lookup(self, word, verbosity):
        """Run a lookup with the configured distance, keeping unknown words."""
        return self._symspell.lookup(
            word,
            verbosity,
            max_edit_distance=self._max_edit_distance,
            include_unknown=True)

    def correct(self, bad_word):
        """Return the term of the first ``Verbosity.TOP`` suggestion."""
        best = self._lookup(bad_word, Verbosity.TOP)
        return best[0].term

    def predictions(self, bad_word):
        """Return the ``Verbosity.CLOSEST`` suggestions for *bad_word*."""
        return self._lookup(bad_word, Verbosity.CLOSEST)
Esempio n. 3
0
    def test_words_from_list_with_shared_prefix_should_retain_counts(self):
        """Check counts when the dictionary is built from a word list.

        The list holds "pipe" five times and "pips" ten times; each query is
        looked up with ``Verbosity.ALL`` and an edit distance of 1 and the
        suggestions are compared, in order, with the expected pairs.
        """
        print('  - %s' % inspect.stack()[0][3])
        corpus = ["pipe"] * 5 + ["pips"] * 10
        checker = SymSpell(16, 1, 3, words=corpus)

        # query -> suggestions expected, in result order
        expectations = (
            ("pipe", (("pipe", 5), ("pips", 10))),
            ("pips", (("pips", 10), ("pipe", 5))),
            ("pip", (("pips", 10), ("pipe", 5))),
        )
        for query, expected in expectations:
            found = checker.lookup(query, Verbosity.ALL, 1)
            self.assertEqual(len(expected), len(found))
            for position, (term, count) in enumerate(expected):
                self.assertEqual(term, found[position].term)
                self.assertEqual(count, found[position].count)
Esempio n. 4
0
 def test_lookup_should_not_return_non_word_delete(self):
     """Check that "paw" and "awn" give no result when only "pawn" is stored.

     Both lookups use ``Verbosity.TOP`` with a maximum edit distance of 0.
     """
     print('  - %s' % inspect.stack()[0][3])
     speller = SymSpell(16, 2, 7, 10)
     speller.create_dictionary_entry("pawn", 10)
     for fragment in ("paw", "awn"):
         matches = speller.lookup(fragment, Verbosity.TOP, 0)
         self.assertEqual(0, len(matches))
Esempio n. 5
0
    def test_verbosity_should_control_lookup_results(self):
        """Check how many suggestions each verbosity level returns.

        With "steam", "steams" and "steem" stored, looking up "steems" at
        edit distance 2 must give 1, 2 and 3 results for TOP, CLOSEST and
        ALL respectively.
        """
        print('  - %s' % inspect.stack()[0][3])
        speller = SymSpell()
        for count, term in enumerate(("steam", "steams", "steem"), start=1):
            speller.create_dictionary_entry(term, count)

        expected_sizes = (
            (Verbosity.TOP, 1),
            (Verbosity.CLOSEST, 2),
            (Verbosity.ALL, 3),
        )
        for verbosity, size in expected_sizes:
            hits = speller.lookup("steems", verbosity, 2)
            self.assertEqual(size, len(hits))
Esempio n. 6
0
    def test_add_additional_counts_should_increase_count(self):
        """Check that adding an existing word again sums the counts.

        "hello" is added with 11 and then with 3; after each step the count
        of the single ``Verbosity.TOP`` suggestion is compared with the
        running total (0 is used when the lookup does not return exactly
        one item).
        """
        print('  - %s' % inspect.stack()[0][3])
        speller = SymSpell()
        word = "hello"
        # (count added in this step, total expected afterwards)
        steps = ((11, 11), (3, 11 + 3))
        for added, expected_total in steps:
            speller.create_dictionary_entry(word, added)
            hits = speller.lookup(word, Verbosity.TOP)
            observed = 0
            if len(hits) == 1:
                observed = hits[0].count
            self.assertEqual(expected_total, observed)
Esempio n. 7
0
    def test_add_additional_counts_should_not_overflow(self):
        """Check that the stored count is capped at ``sys.maxsize``.

        "hello" is added with ``sys.maxsize - 10`` and then with 11; the
        count reported by a ``Verbosity.TOP`` lookup must be
        ``sys.maxsize - 10`` and then ``sys.maxsize`` (0 is used when the
        lookup does not return exactly one item).
        """
        print('  - %s' % inspect.stack()[0][3])
        speller = SymSpell()
        word = "hello"
        # (count added in this step, total expected afterwards)
        steps = ((sys.maxsize - 10, sys.maxsize - 10), (11, sys.maxsize))
        for added, expected_total in steps:
            speller.create_dictionary_entry(word, added)
            hits = speller.lookup(word, Verbosity.TOP)
            observed = 0
            if len(hits) == 1:
                observed = hits[0].count
            self.assertEqual(expected_total, observed)
Esempio n. 8
0
def main():
    """Load a dictionary file and print suggestions for one misspelled word.

    Prints "Dictionary file not found" and returns early when
    ``load_dictionary`` reports failure.
    """
    # settings handed to the SymSpell constructor
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # the dictionary lives next to this file
    here = os.path.dirname(__file__)
    dictionary_path = os.path.join(here, "corpus/dictionary/dictionary.txt")
    # column 0 holds the term, column 1 its frequency
    loaded = sym_spell.load_dictionary(dictionary_path, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    input_term = "bangeeet"  # misspelling
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    hits = sym_spell.lookup(input_term, Verbosity.CLOSEST,
                            max_edit_distance_lookup)
    # one line per suggestion: term, edit distance, term frequency
    for hit in hits:
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))
Esempio n. 9
0
    def test_lookup_should_replicate_noisy_results(self):
        """Check the total number of suggestions over the noisy query file.

        The first space-separated field of every query line that has at
        least two fields is looked up with ``Verbosity.CLOSEST`` and edit
        distance 2; the suggestion counts must add up to 4945.
        """
        print('  - %s' % inspect.stack()[0][3])
        here = path.realpath(path.dirname(__file__))
        dictionary_path = path.realpath(
            path.join(here, pardir, "symspellpy",
                      "frequency_dictionary_en_82_765.txt"))
        query_path = path.join(here, "fortests", "noisy_query_en_1000.txt")

        edit_distance_max = 2
        speller = SymSpell(83000, edit_distance_max, 7)
        speller.load_dictionary(dictionary_path, 0, 1)

        with open(query_path, "r") as infile:
            rows = [line.rstrip().split(" ") for line in infile]
        # only lines with at least two fields contribute a query
        queries = [fields[0] for fields in rows if len(fields) >= 2]

        total = sum(
            len(speller.lookup(query, Verbosity.CLOSEST, edit_distance_max))
            for query in queries)
        self.assertEqual(4945, total)
Esempio n. 10
0
    class SpellCorrector():
        """Callable that maps a word to its closest SymSpell suggestion."""

        # Three or more repeats of the same character.
        _LENGTHENING = re.compile(r"(.)\1{2,}")
        # One trailing '-', '_', '.' or '!'.
        _TRAILING_PUNC = re.compile(r"[\-\_\.\!]$")

        def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
            """Create the SymSpell instance and load the frequency dictionary.

            Args:
                max_edit_distance_dictionary: First argument passed to
                    ``SymSpell``.
                prefix_length: Second argument passed to ``SymSpell``.

            Raises:
                FileNotFoundError: If ``load_dictionary`` reports failure.
            """
            self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
            # os.path.dirname('../') is '..', so the dictionary is expected
            # one directory above the current working directory.
            dictionary_path = os.path.join(os.path.dirname('../'),
                                           "frequency_dictionary_en_82_765.txt")
            term_index = 0  # column of the term in the dictionary text file
            count_index = 1  # column of the term frequency in the dictionary text file
            if not self.sym_spell.load_dictionary(dictionary_path, term_index, count_index):
                # Raising a plain string is itself a TypeError in Python 3;
                # raise a real exception that names the actual problem.
                raise FileNotFoundError("Dictionary file not found")

        def reduce_lengthening(self, text):
            """Collapse runs of three or more identical characters to two.

            NOTE(review): the original author tagged this helper "not work";
            presumably it did not improve results. Confirm before relying
            on it.
            """
            return self._LENGTHENING.sub(r"\1\1", text)

        def strip_punc(self, word):
            """Remove one trailing '-', '_', '.' or '!' from *word*.

            NOTE(review): also tagged "not work" by the original author and
            not called from ``__call__``.
            """
            return self._TRAILING_PUNC.sub("", word)

        def __call__(self, word):
            """Return the first CLOSEST suggestion for *word*, or *word* itself.

            Character runs are shortened first. Words of two characters or
            fewer, words containing an apostrophe, and words without any
            suggestion are returned without a dictionary substitution.
            """
            word = self.reduce_lengthening(word)
            if len(word) > 2 and "'" not in word:
                suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, 2)
                if suggestions:
                    return suggestions[0].term
            return word
def spelling_correction(data,column):
    """Build a table of (original word, replacement) pairs for a text column.

    Every whitespace-separated word of ``row[column]`` is looked up with
    ``Verbosity.CLOSEST`` and an edit distance of 2; words with at least one
    suggestion contribute one row holding the word and the first
    suggestion's term.

    Args:
        data: DataFrame to iterate with ``iterrows``.
        column: Name of the column holding the text.

    Returns:
        A DataFrame with the columns 'Original Word' and 'Replacement', or
        an empty ``pd.DataFrame()`` when no word produced a suggestion.
    """
    from symspellpy.symspellpy import SymSpell , Verbosity
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
       print("Dictionary file not found")

    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # Collect plain lists and build the frame once: DataFrame.append was
    # removed in pandas 2.0 and copied the whole frame on every call.
    originals = []
    replacements = []
    for _, row in data.iterrows():
        for input_term in row[column].split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if suggestions:
                originals.append(input_term)
                replacements.append(suggestions[0].term)

    if not originals:
        return pd.DataFrame()
    # Each appended one-row frame used to carry index label 0; keep that
    # index so existing callers see the same result.
    return pd.DataFrame({'Original Word': originals,
                         'Replacement': replacements},
                        index=[0] * len(originals))
Esempio n. 12
0
 def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
         self):
     """Check that "flam" (count 1) is not returned by an exact lookup.

     The instance is built with ``SymSpell(16, 2, 7, 10)`` and holds
     "flame" with count 20 and "flam" with count 1; a ``Verbosity.TOP``
     lookup of "flam" at edit distance 0 must be empty.
     """
     print('  - %s' % inspect.stack()[0][3])
     speller = SymSpell(16, 2, 7, 10)
     for term, count in (("flame", 20), ("flam", 1)):
         speller.create_dictionary_entry(term, count)
     matches = speller.lookup("flam", Verbosity.TOP, 0)
     self.assertEqual(0, len(matches))
Esempio n. 13
0
 def test_lookup_should_find_exact_match(self):
     """Check that a TOP lookup of "streama" returns exactly "steama".

     "steama", "steamb" and "steamc" are stored with counts 4, 6 and 2 and
     the lookup uses a maximum edit distance of 2.
     """
     print('  - %s' % inspect.stack()[0][3])
     speller = SymSpell()
     for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
         speller.create_dictionary_entry(term, count)
     matches = speller.lookup("streama", Verbosity.TOP, 2)
     self.assertEqual(1, len(matches))
     self.assertEqual("steama", matches[0].term)
Esempio n. 14
0
def symspell_test(tokenpos_list,
                  max_edit_distance_lookup=3,
                  initial_capacity=83000,
                  max_edit_distance_dictionary=3,
                  prefix_length=7,
                  term_index=0,
                  count_index=1):
    """
    Spell-check a list of (word, pos) pairs with SymSpell and print the result.

    Words tagged 'PROPN' and words shorter than three characters are kept
    as they are and also collected in the second returned list. Every other
    word is replaced by the term of its first CLOSEST suggestion; a word
    without any suggestion is kept unchanged.

    Key-word arguments are:
        ** max_edit_distance_lookup : (Recommended maximum = 3)
        ** initial_capacity, max_edit_distance_dictionary, prefix_length :
           passed to the SymSpell constructor in this order
        ** term_index : term column in dictionary (0)
        ** count_index : frequency column in dictionary (1)

    Returns:
        ``(suggestion_list, proper_noun)`` on success, the string
        'Error loading dictionary file' when the dictionary cannot be
        loaded, or 405 when a TypeError is raised while processing.
    """
    print('\n{} \nBegin \'Symspellpy\' testing \n'.format('#' * 20))

    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        suggestion_verbosity = Verbosity.CLOSEST

        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(dictionary_path, term_index,
                                         count_index):
            print("Dictionary file not found")
            return 'Error loading dictionary file'
        suggestion_list = []
        proper_noun = []

        for (word, pos) in tokenpos_list:
            if pos == 'PROPN' or len(word) < 3:
                # proper nouns and very short words are never corrected
                suggestion_list.append(word)
                proper_noun.append(word)
                continue

            suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if not suggestions:
                # Indexing an empty result used to raise an IndexError that
                # the TypeError handler below does not catch; keep the word.
                suggestion_list.append(word)
                continue

            suggestion = suggestions[0]
            # display suggestion term, term frequency, and edit distance
            print(
                "input_term = {}, suggestion_term = {}, suggestion_count = {},\
                suggestion_distance =  {}".format(word, suggestion.term,
                                                  suggestion.count,
                                                  suggestion.distance))
            suggestion_list.append(suggestion.term)
        print("\n\nThe corrected sentence is : {}".format(
            ' '.join(suggestion_list)))
        print(suggestion_list)
        print(proper_noun)
        return suggestion_list, proper_noun
    except TypeError as error:
        print(f'Invalid type : {error}')
        return 405
Esempio n. 15
0
 def test_lookup_should_return_most_frequent(self):
     """Check that a TOP lookup of "stream" returns "steamb" with count 6.

     "steama", "steamb" and "steamc" are stored with counts 4, 6 and 2 and
     the lookup uses a maximum edit distance of 2.
     """
     print('  - %s' % inspect.stack()[0][3])
     speller = SymSpell()
     for term, count in (("steama", 4), ("steamb", 6), ("steamc", 2)):
         speller.create_dictionary_entry(term, count)
     matches = speller.lookup("stream", Verbosity.TOP, 2)
     self.assertEqual(1, len(matches))
     best = matches[0]
     self.assertEqual("steamb", best.term)
     self.assertEqual(6, best.count)
Esempio n. 16
0
        def common_keywords(text):
            """Count the keywords and filters that occur in *text*.

            *text* is lower-cased and split on single spaces; each token is
            replaced by the term of its first CLOSEST suggestion (or kept
            when there is none). The corrected tokens followed by the
            lower-cased text form the string that is searched for every
            lower-cased entry of the 'Keywords' and 'Filters' CSV columns.

            Returns:
                The number of keyword and filter entries found.
            """
            keyword_data = pd.read_csv(
                'D:/ML/QNA_project/CSV_files/keywords.csv')
            filter_data = pd.read_csv(
                'D:/ML/QNA_project/CSV_files/filters.csv')

            text = text.lower()
            tokens = text.split(' ')
            print(tokens)

            max_edit_distance_dictionary = 2
            prefix_length = 9
            sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
            dictionary_path = os.path.join(os.path.dirname(__file__),
                                           "dictionary_final.txt")
            term_index = 0  # column of the term in the dictionary text file
            count_index = 1  # column of the term frequency in the dictionary text file

            if not sym_spell.load_dictionary(dictionary_path, term_index,
                                             count_index):
                print("Dictionary file not found")

            max_edit_distance_lookup = 2
            suggestion_verbosity = Verbosity.CLOSEST

            corrected = []
            for token in tokens:
                # The verbosity argument was missing, so the edit distance
                # was being passed in the verbosity position.
                suggestions = sym_spell.lookup(token, suggestion_verbosity,
                                               max_edit_distance_lookup)
                corrected.append(suggestions[0].term if suggestions else token)
            # corrected words (each followed by a space), then the raw text
            ques = "".join(word + " " for word in corrected) + text

            words = []
            for entries in (keyword_data['Keywords'], filter_data['Filters']):
                for entry in entries:
                    needle = entry.lower()
                    # find(needle, 0, len(needle)) only matched at the very
                    # start of ques; search the whole string instead.
                    if needle in ques:
                        words.append(needle)

            return len(words)
Esempio n. 17
0
def main():
    """Serve spelling suggestions over a UDP socket bound to (host, port).

    For every datagram received, the decoded payload is looked up with
    ``Verbosity.CLOSEST`` and edit distance 2. The number of suggestions is
    sent back to the sender first, followed by one datagram per suggestion
    term. Any exception inside the loop is printed, followed by a
    one-second pause. Returns early, after printing a message, when the
    dictionary cannot be loaded.
    """
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    # column 0 holds the term, column 1 its frequency
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    with socket.socket(socket.AF_INET, socket.SOCK_DGRAM) as s:
        s.bind((host, port))
        while True:
            try:
                payload, source = s.recvfrom(1024)  # Network input
                input_term = payload.decode()
                print("Test2 Input: {}".format(input_term))
                hits = sym_spell.lookup(input_term, Verbosity.CLOSEST,
                                        max_edit_distance_lookup)
                # log term, term frequency and edit distance
                for hit in hits:
                    print("Test2 Output: {}, {}, {}".format(
                        hit.term, hit.count, hit.distance))

                # the count goes first so the client knows how many follow
                s.sendto(str(len(hits)).encode(), source)
                for hit in hits:
                    s.sendto(hit.term.encode(), source)
            except Exception as e:
                print(e)
                time.sleep(1)
Esempio n. 18
0
def main():
    """Demonstrate single-word lookup, compound lookup and segmentation.

    Prints "Dictionary file not found" and returns early when
    ``load_dictionary`` reports failure.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    here = os.path.dirname(__file__)
    dictionary_path = os.path.join(here, "frequency_dictionary_en_82_765.txt")
    # column 0 holds the term, column 1 its frequency
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # used for both lookups below
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    def show(items):
        # one line per suggestion: term, edit distance, term frequency
        for item in items:
            print("{}, {}, {}".format(item.term, item.distance, item.count))

    # 1) single-word input
    single_word = "pyth"
    show(sym_spell.lookup(single_word, Verbosity.CLOSEST,
                          max_edit_distance_lookup))

    # 2) multi-word input (supports compound splitting & merging); the
    #    edit distance applies per word, not to the whole string
    sentence = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixtgrade and ins pired him")
    show(sym_spell.lookup_compound(sentence, max_edit_distance_lookup))

    # 3) a sentence without any spaces
    glued = "thequuickbrownfoxjumpsoverthelazydog"
    segmented = sym_spell.word_segmentation(glued)
    # corrected string, summed edit distance, summed log probability
    print("{}, {}, {}".format(segmented.corrected_string,
                              segmented.distance_sum,
                              segmented.log_prob_sum))
Esempio n. 19
0
def correction(input_term):
    """Return the de-duplicated suggestions for *input_term*, or None.

    Results of ``lookup`` (CLOSEST, edit distance 2) and ``lookup_compound``
    are combined, ordered by edit distance and reduced to the first entry
    seen for each term.

    Returns:
        The list of suggestions, or None when the dictionary cannot be
        loaded (a message is printed) or when no suggestion's term differs
        from *input_term*.
    """
    import collections

    sym_spell = SymSpell()
    here = os.path.dirname(__file__)
    dictionary_path = os.path.join(here, "frequency_dictionary_en_82_765.txt")
    # column 0 holds the term, column 1 its frequency
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    max_edit_distance_lookup = 2
    candidates = sym_spell.lookup(input_term, Verbosity.CLOSEST,
                                  max_edit_distance_lookup)
    candidates.extend(
        sym_spell.lookup_compound(input_term, max_edit_distance_lookup))
    candidates.sort(key=lambda item: item.distance)

    # keep only the first (lowest-distance) entry per term
    first_by_term = collections.OrderedDict()
    for item in candidates:
        first_by_term.setdefault(item.term, item)
    suggestions = list(first_by_term.values())

    # nothing to correct when every suggestion is the input itself
    if all(item.term == input_term for item in suggestions):
        return

    return suggestions
Esempio n. 20
0
def main():
    """Spell-correct the 'Final_words' CSV column and print the word count.

    Each value (rows 0, 51124 and 65070 are skipped) is replaced by the term
    of its first CLOSEST suggestion, or kept when there is no suggestion.
    The results are joined with single spaces, split again on spaces, and
    the number of resulting words is printed. Returns early, after printing
    a message, when the dictionary cannot be loaded.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9
    data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv')

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # NOTE(review): these row numbers are hard-coded in the original;
    # presumably they hold unusable values. Confirm against the CSV.
    skipped_rows = (0, 51124, 65070)

    corrected = []
    print('original')
    for i in range(len(data)):
        if i in skipped_rows:
            continue
        input_term = data['Final_words'][i]
        suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup)
        print(i)
        # The old fallback appended the word without a separator, fusing it
        # with the next one; collecting parts and joining avoids that and
        # the need to chop a trailing character afterwards.
        if suggestions:
            corrected.append(str(suggestions[0].term))
        else:
            corrected.append(input_term)

    words = " ".join(corrected).split(' ')
    print('After')
    print(len(words))
Esempio n. 21
0
def extract_misspellings(s):
    """Replace misspelled words in *s* and report the misspelling ratio.

    The module-level ``sym_spell`` is created and loaded on first use. Each
    whitespace-separated word is looked up with ``Verbosity.CLOSEST`` and an
    edit distance of 2; when the first suggestion differs from the word,
    every occurrence of the word that is surrounded by whitespace is
    replaced in *s* and the word is counted as misspelled.

    Returns:
        ``(mpw, s)``: the number of misspelled words divided by the number
        of words, and the corrected text. For text without any word the
        ratio is 0.0 and the text is returned unchanged.
    """
    global sym_spell
    if sym_spell is None:
        # Initialize the shared SymSpell checker on first use.
        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        if not sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            print("Dictionary file not found")

        if not sym_spell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2):
            print("Bigram dictionary file not found")

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    article_text = s.split()
    if not article_text:
        # nothing to check; also avoids dividing by zero below
        return 0.0, s

    misspelled = 0
    for word in article_text:
        suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                       max_edit_distance_lookup)
        if not suggestions:
            continue
        sug = suggestions[0]
        if sug.term != word:
            # Escape the word: tokens such as "c++" or "(see" would
            # otherwise be read as regex syntax and fail or mis-match.
            pattern = r"\s+" + re.escape(word) + r"\s+"
            s = re.sub(pattern, " " + sug.term + " ", s)
            misspelled += 1
    mpw = misspelled / len(article_text)

    return mpw, s
Esempio n. 22
0
def main():
    """Demonstrate single-word and compound lookup with a custom dictionary.

    Prints "Dictionary file not found" and returns early when
    ``load_dictionary`` reports failure.
    """
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)

    dictionary_path = os.path.join(
        "/Users/meheresh/Documents/cm_spellchecker/spellcheck/data",
        "freqdict.txt")
    # column 0 holds the term, column 1 its frequency
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # used for both lookups below
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    def show(items):
        # one line per suggestion: term, term frequency, edit distance
        for item in items:
            print("{}, {}, {}".format(item.term, item.count, item.distance))

    # 1) single-word input
    single_word = "memebers"  # misspelling of "members"
    show(sym_spell.lookup(single_word, Verbosity.CLOSEST,
                          max_edit_distance_lookup))

    # 2) multi-word input (supports compound splitting & merging); the
    #    edit distance applies per word, not to the whole string
    sentence = ("whereis th elove hehad dated forImuch of thepast who "
                "couqdn'tread in sixtgrade and ins pired him")
    show(sym_spell.lookup_compound(sentence, max_edit_distance_lookup))
Esempio n. 23
0
def main(argv):
    """Append exercise links for a category name to a markdown file.

    Args:
        argv: Command-line style list; ``argv[1]`` is the category text and
            ``argv[2]`` the markdown file to append to. Any other length
            prints a usage message and returns.

    The words of the lower-cased category are looked up with
    ``Verbosity.CLOSEST``; the lookup distance is 3 when the average word
    length exceeds 4, otherwise 2. One bullet is written per suggestion,
    using the third space-separated field of the matching line of
    ``category_count.txt`` in the link.
    """
    if len(argv) != 3:
        print('usage:\n    python .py "<categoria>" <markdown gerado>')
        return
    category, markdown = argv[1], argv[2]

    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # term -> third field of its line; unknown terms map to 0
    d = defaultdict(lambda: 0)
    # 'with' closes the file even when a malformed line raises below
    with open(dictionary_path, 'r') as categorys:
        for x in categorys:
            z = x.split(' ')
            # The line is not stripped, so a trailing newline stays on the
            # last field. NOTE(review): if field 2 is the last one, that
            # newline is what ends each bullet written below. Confirm
            # before stripping.
            d[z[0]] = z[2]

    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    with open(markdown, 'a') as f:
        f.write('\n## Lista de Exercicios - %s\n' % category.capitalize())
        inputs = category.lower().split(' ')
        total_avg = sum(map(len, inputs)) / len(inputs)

        max_edit_distance_lookup = 3 if total_avg > 4 else 2
        for input_term in inputs:
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            for suggestion in suggestions:
                f.write("* {}, https://a2oj.com/{}".format(
                    suggestion.term.capitalize(), d[suggestion.term]))
def check_spelling(content):
    """Map each misspelled token of *content* to its top suggestion.

    *content* is tokenised with the module-level ``nlp``; every token's
    text is looked up with ``Verbosity.TOP``, an edit distance of 2 and
    ``include_unknown=False``. A suggestion with a distance greater than 0
    is recorded under the token's text.

    Returns:
        A dict ``{token text: suggested term}``, or None when the
        dictionary cannot be loaded (a message is printed).
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    # NOTE(review): bigram_path is computed but never loaded here.
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    # term_index is the column of the term and count_index is the
    # column of the term frequency
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return

    doc = nlp(content)
    suggest = {}
    for word in doc:
        print("content", word.text)
        suggestions = sym_spell.lookup(word.text,
                                       Verbosity.TOP,
                                       max_edit_distance=2,
                                       include_unknown=False)
        for suggestion in suggestions:
            # Use the public accessors instead of the private
            # _distance/_term attributes.
            if suggestion.distance > 0:
                suggest[word.text] = suggestion.term

    print(suggest)
    return suggest
def spellcheck(text):
    """Return a lower-cased, spell-corrected version of ``text``.

    The text is passed through ``expandContractions``, lower-cased and
    tokenized with ``nltk.word_tokenize``. Punctuation tokens are appended
    without a leading space; every other token is appended with one.
    A handful of apostrophe-less contractions are expanded by hand, and all
    remaining tokens are replaced by their top SymSpell suggestion (or kept
    unchanged when there is none).

    :param text: input text.
    :return: the corrected string, or ``None`` when the dictionary file
        could not be loaded.
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    if not sym_spell.load_dictionary(
            '6._Ranking/tools_for_spellcheck/frequency_dictionary_en_82_765.txt',
            0, 1):
        print("Dictionary file not found")
        return

    # Tokens with a fixed replacement (leading space included).
    fixed_forms = {
        'i': ' i',
        'ive': ' i have',
        'id': ' i would',
        'im': ' i am',
        'dont': ' do not',
    }

    pieces = []
    for token in nltk.word_tokenize(expandContractions(text).lower()):
        if token in '.,:;?!-':
            # punctuation attaches directly to the preceding word
            pieces.append(token)
            continue
        if token in fixed_forms:
            pieces.append(fixed_forms[token])
            continue
        # TOP, CLOSEST, ALL
        candidates = sym_spell.lookup(token, Verbosity.TOP, 2)
        if candidates:
            pieces.extend(' ' + str(candidate.term)
                          for candidate in candidates)
        else:
            # no suggestion found: keep the token so no word is dropped
            pieces.append(' ' + token)
    return ''.join(pieces)
def main():
    """Demonstrate single-word and compound SymSpell lookups.

    Loads a frequency dictionary, prints the closest suggestions for one
    misspelled word, then prints the result of a compound lookup on a
    multi-word string. Returns early (``None``) when the dictionary cannot
    be loaded.
    """
    edit_distance = 2  # used for the dictionary and for both lookups
    sym_spell = SymSpell(edit_distance, 7)

    here = os.path.dirname(__file__)
    dictionary_path = os.path.join(
        here, "/home/raghu/Downloads/frequency_dictionary_en_82_765.txt")
    # column 0 holds the term, column 1 its frequency
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # Single-word lookup; "memebers" is a misspelling of "members".
    # Verbosity may be TOP, CLOSEST or ALL.
    for hit in sym_spell.lookup("memebers", Verbosity.CLOSEST, edit_distance):
        print(f"{hit.term}, {hit.distance}, {hit.count}")

    # Multi-word lookup (supports compound splitting & merging); the edit
    # distance applies per single word, not to the whole input string.
    compound_hits = sym_spell.lookup_compound("AGUDATA OF BIRTH",
                                              edit_distance)
    print(compound_hits)
    for hit in compound_hits:
        print(f"{hit.term}, {hit.distance}, {hit.count}")
Esempio n. 27
0
    def _symspell(self, sentences):
        """
        Correct spelling with SymSpell (Symmetric Delete spelling algorithm).

        A dictionary is built from ``self.corpus_path``, written to
        ``self.dictionary_path`` as ``term count`` lines and loaded back.
        Each whitespace-separated token of each sentence is then replaced by
        its first suggestion; tokens found in ``string.punctuation`` and
        tokens without any suggestion are kept unchanged.

        Accepts a single sentence or a list of sentences and always returns
        a list of corrected sentences.

        Reference:
            Author: Wolf Garbe
            Description: https://medium.com/@wolfgarbe/1000x-faster-spelling-correction-algorithm-2012-8701fcd87a5f
            URL: https://github.com/wolfgarbe/symspell

            Python module: symspellpy (https://github.com/mammothb/symspellpy)
        """

        checker = SymSpell(max_dictionary_edit_distance=self.N)
        checker.create_dictionary(self.corpus_path)

        with open(self.dictionary_path, "w") as handle:
            handle.writelines(f"{term} {freq}\n"
                              for term, freq in checker.words.items())

        checker.load_dictionary(self.dictionary_path,
                                term_index=0,
                                count_index=1)

        if not isinstance(sentences, list):
            sentences = [sentences]

        def fix_token(token):
            # punctuation is never looked up
            if token in string.punctuation:
                return token
            found = checker.lookup(token.lower(),
                                   verbosity=0,
                                   max_edit_distance=self.N,
                                   transfer_casing=True)
            return found[0].term if found else token

        return [" ".join(fix_token(token) for token in sentence.split())
                for sentence in sentences]
Esempio n. 28
0
def main(argv):
    """Append a list of exercise links for a category to a markdown file.

    ``argv[1]`` is the category text and ``argv[2]`` the markdown file to
    append to. Each space-separated word of the category is looked up in
    the ``category_count.txt`` dictionary, and for every suggestion one
    bullet line ``* <Term>, https://a2oj.com/<third column>`` is written.

    :param argv: argument vector; must contain exactly three items.
    :return: ``None``. Prints a usage message when the argument count is
        wrong, or an error when the dictionary file cannot be loaded.
    """
    if len(argv) != 3:
        print('usage:\n    python .py "<categoria>" <markdown gerado>')
        return
    # ``category`` rather than ``input`` so the builtin is not shadowed
    category, markdown = argv[1], argv[2]

    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary, prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # Map each term to the third column of its dictionary line, which is
    # used as the URL suffix below; unknown terms fall back to 0.
    d = defaultdict(int)
    with open(dictionary_path, 'r') as categorys:
        for line in categorys:
            # Drop the line terminator so it does not leak into the value.
            fields = line.rstrip('\n').split(' ')
            if len(fields) < 3:
                # blank or malformed line: nothing to map
                continue
            d[fields[0]] = fields[2]

    with open(markdown, 'a') as f:
        f.write('\n## Lista de Exercicios - %s\n' % category.capitalize())
        inputs = category.lower().split(' ')
        # str.split(' ') never returns an empty list, so this cannot
        # divide by zero.
        total_avg = sum(map(len, inputs)) / len(inputs)

        # allow one more edit for categories made of longer words
        max_edit_distance_lookup = 3 if total_avg > 4 else 2
        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        for input_term in inputs:
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity, max_edit_distance_lookup)
            for suggestion in suggestions:
                # Terminate every bullet explicitly instead of relying on
                # a newline left over from the dictionary line.
                f.write("* {}, https://a2oj.com/{}\n".format(
                    suggestion.term.capitalize(), d[suggestion.term]))
Esempio n. 29
0
File: api.py Progetto: quan97it/data
def process(input_string):
    """Return the closest SymSpell suggestions for ``input_string``.

    Both the unigram and the bigram dictionaries bundled with
    ``symspellpy`` must load successfully; otherwise a message is printed
    and ``None`` is returned.

    :param input_string: term to look up (maximum edit distance 2).
    :return: list of ``(term, distance, count)`` tuples, one per suggestion.
    """
    edit_distance = 2
    sym_spell = SymSpell(edit_distance, 7)

    unigram_file = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    bigram_file = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

    unigrams_loaded = sym_spell.load_dictionary(
        unigram_file, term_index=0, count_index=1)
    if not unigrams_loaded:
        print("Dictionary file not found")
        return

    bigrams_loaded = sym_spell.load_bigram_dictionary(
        bigram_file, term_index=0, count_index=2)
    if not bigrams_loaded:
        print("Bigram dictionary file not found")
        return

    hits = sym_spell.lookup(input_string, Verbosity.CLOSEST, edit_distance)
    return [(hit.term, hit.distance, hit.count) for hit in hits]
Esempio n. 30
0
def symspell_test(tokenpos_list: list,
                  ignore_length=2,
                  max_edit_distance_lookup=2,
                  initial_capacity=83000,
                  max_edit_distance_dictionary=2,
                  prefix_length=7,
                  suggestion_verbosity=Verbosity.TOP) -> list:
    """
    Spell-correct a list of ``(word, pos)`` tuples with SymSpell.

    Words tagged ``'PROPN'`` and words of at most ``ignore_length``
    characters are passed through untouched and also recorded as ignored.
    Every other word is replaced by its first suggestion; if the lookup
    yields no suggestion at all the word is kept as is.

    The SymSpell instance is prepared by ``check_symspell_dictionary``,
    which is defined elsewhere (presumably it loads the dictionary --
    confirm).

    keyword arguments are:

    suggestion_verbosity =
    TOP: Top suggestion with smallest edit distance with highest term frequency.
    CLOSEST: All suggestions of smallest edit distance found ordered by frequency.
    ALL: All suggestions within maxEditDistance.

    :return: tuple ``(suggestion_list, intact_words)`` -- list of suggested
        corrections, list of ignored words (note: the ``-> list``
        annotation does not reflect this tuple)
    :return: 410 Error: Wrong input type! (Expected list of 2 element tuples)
    """
    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        check_symspell_dictionary(sym_spell)
        suggestion_list = []
        intact_words = []
        for (word, pos) in tokenpos_list:
            if pos == 'PROPN' or len(word) <= ignore_length:
                suggestion_list.append(word)
                intact_words.append(word)
                continue
            suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                           max_edit_distance_lookup)
            # lookup() returns an empty list when no dictionary term is
            # within the edit distance; indexing [0] unconditionally would
            # raise an IndexError that the handler below does not catch.
            suggestion_list.append(
                suggestions[0].term if suggestions else word)
        return suggestion_list, intact_words
    except (ValueError, TypeError):
        logging.error('Invalid type! Type List of tuples expected as input.')
        return 410
def main():
    dictionary = 'Dictionary_symspell_50_clusters.txt'
    performance_sym = 'performance_sym.txt'
    fout = open(performance_sym, 'w')
    word_list = []
    n = 100
    with open(dictionary, 'r') as f:
        for line in f:
            x = line.strip("\n").split(' ')
            word_list.append(x[0])
    initial_capacity = 600
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    term_index = 0
    count_index = 1
    dictionary_path = os.path.join(os.path.dirname(__file__), dictionary)
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    number_of_wrong_char = 2
    number_of_word = 2
    for i in range(n):
        c, w = gen_word(number_of_word, number_of_wrong_char, word_list)
        max_edit_distance_lookup = 2
        suggestion_verbosity = Verbosity.CLOSEST
        input_term = ""
        # print ("correct word : " + c)
        fout.write(c + "\t" + w + "\t")
        suggestions = sym_spell.lookup(w, suggestion_verbosity,
                                       max_edit_distance_lookup)
        for suggestion in suggestions:
            fout.write(suggestion.term + " ")
        fout.write("\n")