Example #1
class SymSpellChecker(object):
    def __init__(self):
        self.checker = SymSpell(max_dictionary_edit_distance=2)
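        # term_index=0, count_index=1: each line of the frequency dictionary is "<term> <count>"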
        self.checker.load_dictionary(
            '/home/citao/github/symspellpy/frequency_dictionary_en_82_765.txt',
            0, 1)
        self.checker.load_bigram_dictionary(
            '/home/citao/github/symspellpy/frequency_bigramdictionary_en_243_342.txt',
            0, 2)

    def correct(self, word):
        suggestions = self.checker.lookup(word,
                                          Verbosity.CLOSEST,
                                          max_edit_distance=2)
        for suggestion in suggestions:
            cor_word = suggestion.term
            logging.info('Spell check: [{}] -> [{}]'.format(word, cor_word))
            return cor_word
        return word

    def correct_text(self, text):
        cor_list = []
        for word in text.split(' '):
            suggestions = self.checker.lookup(word,
                                              Verbosity.CLOSEST,
                                              max_edit_distance=2)
            cor_flag = False
            for suggestion in suggestions:
                cor_word = suggestion.term
                cor_list.append(cor_word)
                cor_flag = True
                break
            if not cor_flag:
                cor_list.append(word)
        return ' '.join(cor_list)
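The checker above omits its imports and hard-codes a dictionary path from the original author's machine. A minimal, self-contained sketch of the same lookup flow, assuming the frequency dictionary bundled with symspellpy and resolving its path via pkg_resources, could look like this:

import logging
import pkg_resources
from symspellpy import SymSpell, Verbosity

sym_spell = SymSpell(max_dictionary_edit_distance=2)
dictionary_path = pkg_resources.resource_filename(
    "symspellpy", "frequency_dictionary_en_82_765.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

def correct(word):
    # CLOSEST returns all suggestions at the smallest edit distance found, ordered by count
    suggestions = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2)
    if suggestions:
        logging.info('Spell check: [%s] -> [%s]', word, suggestions[0].term)
        return suggestions[0].term
    return word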
Example #2
    def test_lookup_transfer_casing(self):
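        # transfer_casing=True carries the casing pattern of the input over to the returned suggestion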
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        result = sym_spell.lookup("Stream",
                                  Verbosity.TOP,
                                  2,
                                  transfer_casing=True)
        self.assertEqual("Steam", result[0].term)

        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        result = sym_spell.lookup("StreaM",
                                  Verbosity.TOP,
                                  2,
                                  transfer_casing=True)
        self.assertEqual("SteaM", result[0].term)

        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 4)
        result = sym_spell.lookup("STREAM",
                                  Verbosity.TOP,
                                  2,
                                  transfer_casing=True)
        self.assertEqual("STEAM", result[0].term)

        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("i", 4)
        result = sym_spell.lookup("I", Verbosity.TOP, 2, transfer_casing=True)
        self.assertEqual("I", result[0].term)
Example #3
    def test_words_with_shared_prefix_should_retain_counts(self):
        sym_spell = SymSpell(1, 3)
        sym_spell.create_dictionary_entry("pipe", 5)
        sym_spell.create_dictionary_entry("pips", 10)

        result = sym_spell.lookup("pipe", Verbosity.ALL, 1)
        self.assertEqual(2, len(result))
        self.assertEqual("pipe", result[0].term)
        self.assertEqual(5, result[0].count)
        self.assertEqual("pips", result[1].term)
        self.assertEqual(10, result[1].count)

        result = sym_spell.lookup("pips", Verbosity.ALL, 1)
        self.assertEqual(2, len(result))
        self.assertEqual("pips", result[0].term)
        self.assertEqual(10, result[0].count)
        self.assertEqual("pipe", result[1].term)
        self.assertEqual(5, result[1].count)

        result = sym_spell.lookup("pip", Verbosity.ALL, 1)
        self.assertEqual(2, len(result))
        self.assertEqual("pips", result[0].term)
        self.assertEqual(10, result[0].count)
        self.assertEqual("pipe", result[1].term)
        self.assertEqual(5, result[1].count)
Example #4
 def test_lookup_should_not_return_non_word_delete(self):
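     # "paw" and "awn" exist only as generated deletes of "pawn", not as dictionary words,
     # so a lookup with max_edit_distance=0 should return nothing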
     sym_spell = SymSpell(2, 7, 10)
     sym_spell.create_dictionary_entry("pawn", 10)
     result = sym_spell.lookup("paw", Verbosity.TOP, 0)
     self.assertEqual(0, len(result))
     result = sym_spell.lookup("awn", Verbosity.TOP, 0)
     self.assertEqual(0, len(result))
Example #5
 def test_lookup_should_not_return_non_word_delete(self):
     print('  - %s' % inspect.stack()[0][3])
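     # this four-argument form (initial_capacity first) matches older symspellpy releases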
     sym_spell = SymSpell(16, 2, 7, 10)
     sym_spell.create_dictionary_entry("pawn", 10)
     result = sym_spell.lookup("paw", Verbosity.TOP, 0)
     self.assertEqual(0, len(result))
     result = sym_spell.lookup("awn", Verbosity.TOP, 0)
     self.assertEqual(0, len(result))
Example #6
    def test_add_additional_counts_should_increase_count(self):
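        # create_dictionary_entry adds to the count of an existing entry instead of replacing it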
        sym_spell = SymSpell()
        word = "hello"
        sym_spell.create_dictionary_entry(word, 11)
        result = sym_spell.lookup(word, Verbosity.TOP)
        count = result[0].count if len(result) == 1 else 0
        self.assertEqual(11, count)

        sym_spell.create_dictionary_entry(word, 3)
        result = sym_spell.lookup(word, Verbosity.TOP)
        count = result[0].count if len(result) == 1 else 0
        self.assertEqual(11 + 3, count)
Example #7
    def test_add_additional_counts_should_not_overflow(self):
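        # counts are capped at sys.maxsize, so adding to an entry near the cap must not overflow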
        sym_spell = SymSpell()
        word = "hello"
        sym_spell.create_dictionary_entry(word, sys.maxsize - 10)
        result = sym_spell.lookup(word, Verbosity.TOP)
        count = result[0].count if len(result) == 1 else 0
        self.assertEqual(sys.maxsize - 10, count)

        sym_spell.create_dictionary_entry(word, 11)
        result = sym_spell.lookup(word, Verbosity.TOP)
        count = result[0].count if len(result) == 1 else 0
        self.assertEqual(sys.maxsize, count)
Example #8
    def test_verbosity_should_control_lookup_results(self):
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("steam", 1)
        sym_spell.create_dictionary_entry("steams", 2)
        sym_spell.create_dictionary_entry("steem", 3)

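        # TOP returns the single best suggestion, CLOSEST all suggestions at the
        # smallest found edit distance, and ALL every suggestion within max_edit_distance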
        result = sym_spell.lookup("steems", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        result = sym_spell.lookup("steems", Verbosity.CLOSEST, 2)
        self.assertEqual(2, len(result))
        result = sym_spell.lookup("steems", Verbosity.ALL, 2)
        self.assertEqual(3, len(result))
Example #9
    def test_lookup_should_replicate_noisy_results(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, pardir, "symspellpy",
                         "frequency_dictionary_en_82_765.txt"))
        query_path = os.path.join(cwd, "fortests", "noisy_query_en_1000.txt")

        edit_distance_max = 2
        prefix_length = 7
        verbosity = Verbosity.CLOSEST
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        test_list = []
        with open(query_path, "r") as infile:
            for line in infile.readlines():
                line_parts = line.rstrip().split(" ")
                if len(line_parts) >= 2:
                    test_list.append(line_parts[0])
        result_sum = 0
        for phrase in test_list:
            result_sum += len(
                sym_spell.lookup(phrase, verbosity, edit_distance_max))
        self.assertEqual(4945, result_sum)
Example #10
 def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
         self):
     sym_spell = SymSpell(2, 7, 10)
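     # count_threshold=10 means "flam" (count 1) stays below the threshold and is visible
     # only as a delete of "flame", so a TOP lookup of "flam" itself returns nothing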
     sym_spell.create_dictionary_entry("flame", 20)
     sym_spell.create_dictionary_entry("flam", 1)
     result = sym_spell.lookup("flam", Verbosity.TOP, 0)
     self.assertEqual(0, len(result))
Example #11
class SymSpellModel(CandidateModelBase):
    """
    Candidate model based on symspell algorithm.
    https://github.com/wolfgarbe/SymSpell
    """
    def __init__(
        self,
        config: Config,
    ):
        self.sym_spell = SymSpell()
        self.config = config
        self.load_dictionary()

    def load_dictionary(self):
        if not self.config.DICTIONARY_PATH.is_file():
            raise FileNotFoundError("Dictionary doesn't exist")
        self.sym_spell.load_dictionary(self.config.DICTIONARY_PATH,
                                       term_index=0,
                                       count_index=1)

    def get_candidates(self, word: str, n=float("inf")) -> List[str]:
        suggestions = self.sym_spell.lookup(
            word,
            self.config.verbosity,
            max_edit_distance=self.config.max_edit_distance)
        suggested_words = []
        for i, suggestion in enumerate(suggestions):
            if i >= n:  # stop once n candidates have been collected
                break
            suggested_words.append(suggestion.term)
        return suggested_words
Example #12
 def test_lookup_max_edit_distance_too_large(self):
     sym_spell = SymSpell(2, 7, 10)
     sym_spell.create_dictionary_entry("flame", 20)
     sym_spell.create_dictionary_entry("flam", 1)
     with pytest.raises(ValueError) as excinfo:
         __ = sym_spell.lookup("flam", Verbosity.TOP, 3)
     self.assertEqual("Distance too large", str(excinfo.value))
Example #13
 def test_lookup_include_unknown(self):
     sym_spell = SymSpell(2, 7, 10)
     sym_spell.create_dictionary_entry("flame", 20)
     sym_spell.create_dictionary_entry("flam", 1)
     result = sym_spell.lookup("flam", Verbosity.TOP, 0, True)
     self.assertEqual(1, len(result))
     self.assertEqual("flam", result[0].term)
Example #14
def spell_checker(df, pickling=False):
    '''Takes a DataFrame with a "Full review" column, runs each word through SymSpell and
    replaces it with the correctly spelled string from the dictionary (max Levenshtein distance=2).
    '''

    df = df[df["Full review"].notna()]
    
    sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)
    
    reviews = list(df["Full review"])

    cleaned_reviews = []
    for review in tqdm(reviews):
        cleaned_review = []
        for word in review.split():
            if len(word) > 5:
                word = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True, \
                    transfer_casing=True, ignore_token='([A-z]+)-([A-z]+)')[0].term
            cleaned_review.append(word)
        cleaned_reviews.append(" ".join(cleaned_review))

    df["Spell-checked review"] = cleaned_reviews
    
    if pickling:
        with open("spell_checked_data.pickle", "wb") as to_write:
            pickle.dump(df, to_write)

    return df
Example #15
def f_typo(w_list):
    """
    :param w_list: word list to be processed
    :return: w_list with typo fixed by symspell. words with no match up will be dropped
    """
    sym_spell = SymSpell(max_dictionary_edit_distance=3, prefix_length=7)
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    if sym_spell.word_count:
        pass
    else:
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

    w_list_fixed = []
    for word in w_list:
        suggestions = sym_spell.lookup(word,
                                       Verbosity.CLOSEST,
                                       max_edit_distance=3)
        if suggestions:
            w_list_fixed.append(suggestions[0].term)
        else:
            pass
            # do word segmentation, deprecated for inefficiency
            # w_seg = sym_spell.word_segmentation(phrase=word)
            # w_list_fixed.extend(w_seg.corrected_string.split())
    return w_list_fixed
Example #16
    def test_delete_dictionary_entry_invalid_word(self):
        sym_spell = SymSpell()
        sym_spell.create_dictionary_entry("stea", 1)
        sym_spell.create_dictionary_entry("steama", 2)
        sym_spell.create_dictionary_entry("steem", 3)

        result = sym_spell.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("steama", result[0].term)
        self.assertEqual(len("steama"), sym_spell._max_length)

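        # deleting a word that was never added returns False and leaves the dictionary untouched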
        self.assertFalse(sym_spell.delete_dictionary_entry("steamab"))
        result = sym_spell.lookup("steama", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("steama", result[0].term)
        self.assertEqual(len("steama"), sym_spell._max_length)
Example #17
 def test_lookup_include_unknown(self):
     print('  - %s' % inspect.stack()[0][3])
     sym_spell = SymSpell(16, 2, 7, 10)
     sym_spell.create_dictionary_entry("flame", 20)
     sym_spell.create_dictionary_entry("flam", 1)
     result = sym_spell.lookup("qwer", Verbosity.TOP, 0, True)
     self.assertEqual(1, len(result))
     self.assertEqual("qwer", result[0].term)
Example #18
 def test_lookup_should_find_exact_match(self):
     sym_spell = SymSpell()
     sym_spell.create_dictionary_entry("steama", 4)
     sym_spell.create_dictionary_entry("steamb", 6)
     sym_spell.create_dictionary_entry("steamc", 2)
     result = sym_spell.lookup("streama", Verbosity.TOP, 2)
     self.assertEqual(1, len(result))
     self.assertEqual("steama", result[0].term)
Example #19
 def test_lookup_should_return_most_frequent(self):
     sym_spell = SymSpell()
     sym_spell.create_dictionary_entry("steama", 4)
     sym_spell.create_dictionary_entry("steamb", 6)
     sym_spell.create_dictionary_entry("steamc", 2)
     result = sym_spell.lookup("stream", Verbosity.TOP, 2)
     self.assertEqual(1, len(result))
     self.assertEqual("steamb", result[0].term)
     self.assertEqual(6, result[0].count)
Example #20
 def test_lookup_avoid_exact_match_early_exit(self):
     edit_distance_max = 2
     sym_spell = SymSpell(edit_distance_max, 7, 10)
     sym_spell.create_dictionary_entry("flame", 20)
     sym_spell.create_dictionary_entry("flam", 1)
     result = sym_spell.lookup("24th", Verbosity.ALL, edit_distance_max,
                               ignore_token=r"\d{2}\w*\b")
     self.assertEqual(1, len(result))
     self.assertEqual("24th", result[0].term)
Example #21
 def test_deletes(self):
     sym_spell = SymSpell()
     sym_spell.create_dictionary_entry("steama", 4)
     sym_spell.create_dictionary_entry("steamb", 6)
     sym_spell.create_dictionary_entry("steamc", 2)
     result = sym_spell.lookup("stream", Verbosity.TOP, 2)
     self.assertEqual(1, len(result))
     self.assertEqual("steamb", result[0].term)
     self.assertEqual(6, result[0].count)
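     # deletes exposes the internal map from generated delete strings to dictionary words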
     self.assertTrue(len(sym_spell.deletes))
Example #22
    def test_load_dictionary_encoding(self):
        dictionary_path = os.path.join(self.fortests_path, "non_en_dict.txt")

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(edit_distance_max, prefix_length)
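        # an explicit encoding is needed when the dictionary file contains non-ASCII terms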
        sym_spell.load_dictionary(dictionary_path, 0, 1, encoding="utf-8")

        result = sym_spell.lookup("АБ", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("АБИ", result[0].term)
Example #23
class NameChecker(object):
	def __init__(self, name_list):
		self.sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
		
		for each_name in name_list:
			self.sym_spell.create_dictionary_entry(each_name, len(each_name.split(' ')))
		
	def get_name(self, name):
		suggestions = self.sym_spell.lookup(name, Verbosity.CLOSEST, max_edit_distance=2, transfer_casing=True)
		
		if suggestions is not None and len(suggestions) > 0:
			return suggestions[0].term
		
		return name
Example #24
    def test_load_dictionary_encoding(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = os.path.realpath(os.path.dirname(__file__))
        dictionary_path = os.path.realpath(
            os.path.join(cwd, "fortests", "non_en_dict.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1, encoding="utf-8")

        result = sym_spell.lookup("АБ", Verbosity.TOP, 2)
        self.assertEqual(1, len(result))
        self.assertEqual("АБИ", result[0].term)
Example #25
    def symspell_correction(
            misspelled):  # not used because it is too expensive
        from symspellpy import SymSpell, Verbosity

        sym_spell = SymSpell(83000, 2)
        dictionary_path = resdir + "frequency_dictionary_en_82_765.txt"
        if not sym_spell.load_dictionary(dictionary_path, 0, 1):
            return ""
        suggestions = sym_spell.lookup(misspelled, Verbosity.CLOSEST, 2)
        if suggestions:
            return sorted(suggestions, key=lambda x: x.count,
                          reverse=True)[0].term
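        # fall back to lookup_compound, which can also fix strings containing multiple words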
        return sorted(sym_spell.lookup_compound(misspelled, 2),\
                      key = lambda x: x.count,\
                      reverse = True)[0].term
Example #26
def process_comments(comments_column):
    # Apostrophe expansion
    comments_column = comments_column.apply(lambda x: x.replace("’", "'"))
    comments_column = comments_column.apply(lambda x: expandContractions(x))
    # Lowercase tweets
    comments_column = comments_column.apply(lambda x: x.lower())
    # Remove url, hashtags, cashtags, twitter handles, and RT. Only words
    comments_column = comments_column.apply(lambda x: ' '.join(
        re.sub(
            r"(@[A-Za-z0-9]+)|^rt |(#[A-Za-z0-9]+) |(\w+:\/*\S+)|[^a-zA-Z\s]",
            "", x).split()))
    # Remove url token
    comments_column = comments_column.apply(lambda x: x.replace('url', ''))
    # Lemmatisation
    tokeniser = TweetTokenizer()
    wordnet_lemmatizer = WordNetLemmatizer()
    comments_column = comments_column.apply(
        lambda x: [word for word in tokeniser.tokenize(x)])
    sym_spell = SymSpell()
    dictionary_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_dictionary_en_82_765.txt")
    sym_spell.load_dictionary(dictionary_path, 0, 1)
    # spell_checkers.create_dictionary("eng_dict.txt")
    print("Spell checker...")
    for i in range(len(comments_column)):
        try:
            if i == (len(comments_column) - 1) or i % 10000 == 0:
                print('%i out of %i' % (i, len(comments_column)))
            for j in range(len(comments_column[i])):
                suggestions = sym_spell.lookup(comments_column[i][j],
                                               Verbosity.CLOSEST,
                                               max_edit_distance=2)
                # suggestions = spell_checkers.get_suggestions(comments_column[i][j])
                if suggestions:
                    # SuggestItem exposes the corrected word directly via .term
                    best_sugg = suggestions[0].term
                    comments_column[i][j] = best_sugg
        except Exception:
            continue

    comments_column = comments_column.apply(lambda x: ' '.join(
        [wordnet_lemmatizer.lemmatize(word, pos="v") for word in x]))

    return comments_column
Example #27
 def useSymspell(self):
     self.originalText, self.errorText = FP().prepareFiles()
     originalSentencesList, errorSentencesList = EC().textToSentences(
         self.originalText, self.errorText)
     print(len(originalSentencesList), len(errorSentencesList))
     speller = SymSpell()
     corpusPath = FP().definePathToCoprus()
     speller.create_dictionary(corpusPath, encoding='utf-8')
     processedWordsList = []
     for sentence in errorSentencesList:
         sentenceWords = EC().sentencesToWords(sentence)
         for word in sentenceWords:
             suggestions = speller.lookup(word,
                                          Verbosity.CLOSEST,
                                          max_edit_distance=2,
                                          include_unknown=True)
             for suggestion in suggestions:
                 processedWordsList.append(suggestion.term)
                 break
     print(len(processedWordsList))
     self.useWordsMetrics(self.originalText, processedWordsList)
Example #28
    def spell_checker(self):
        '''Runs every word of each stored review through SymSpell and replaces it with the
        correctly spelled string from the dictionary (max Levenshtein distance=2).
        '''

        sym_spell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
        sym_spell.load_dictionary("frequency_dictionary_en_82_765.txt", term_index=0, count_index=1)
        
        reviews = self.reviews

        cleaned_reviews = []
        for review in tqdm(reviews):
            cleaned_review = []
            for word in tqdm(review.split()):
                if len(word) > 5:
                    word = sym_spell.lookup(word, Verbosity.CLOSEST, max_edit_distance=2, include_unknown=True, \
                        transfer_casing=True, ignore_token='([A-z]+)-([A-z]+)')[0].term
                cleaned_review.append(word)
            cleaned_reviews.append(" ".join(cleaned_review))

        self.reviews = cleaned_reviews
Example #29
    def test_lookup_should_replicate_noisy_results(self):
        query_path = os.path.join(self.fortests_path,
                                  "noisy_query_en_1000.txt")

        edit_distance_max = 2
        prefix_length = 7
        verbosity = Verbosity.CLOSEST
        sym_spell = SymSpell(edit_distance_max, prefix_length)
        sym_spell.load_dictionary(self.dictionary_path, 0, 1)

        test_list = []
        with open(query_path, "r") as infile:
            for line in infile.readlines():
                line_parts = line.rstrip().split(" ")
                if len(line_parts) >= 2:
                    test_list.append(line_parts[0])
        result_sum = 0
        for phrase in test_list:
            result_sum += len(
                sym_spell.lookup(phrase, verbosity, edit_distance_max))
        self.assertEqual(4945, result_sum)
Example #30
    def replace(self):
        ''' Generates a new text file by correcting the spellings'''
        sym_spell = SymSpell(max_dictionary_edit_distance=self.maxd,
                             prefix_length=self.prefix_len)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        # term_index is the column of the term and count_index is the
        # column of the term frequency
        sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)

        txtfile_corrected = self.txtfile
        for line in txtfile_corrected:
            for word in line:
                if (word.isalpha() and len(word) > 1):
                    input_term = word
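                    # include_unknown=True guarantees a non-empty result, so suggestions[0] is always safe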
                    suggestions = sym_spell.lookup(input_term,
                                                   Verbosity.TOP,
                                                   max_edit_distance=self.maxd,
                                                   transfer_casing=True,
                                                   include_unknown=True)
                    line[line.index(word)] = suggestions[0].term
        write_file(self.new_name, txtfile_corrected)