def main():
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    # create object
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(
        os.path.dirname(__file__), "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)
    # display corrected string, summed edit distance, and summed log probability
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
def initializeSymspell():
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")
    resourceNames = [
        "symspellpy", "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt"
    ]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")
    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_ditionary() done")
    symspell.load_bigram_dictionary(bigramPath, 0, 1)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_ditionary() done")

    # Create vocab
    vocab = set([w for w, f in symspell.words.items()])

    return symspell, vocab
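# A minimal usage sketch for initializeSymspell() above, assuming symspellpy is
# installed together with its bundled frequency dictionaries; the sample
# sentence and edit distance are illustrative choices, not values from the code:
symspell, vocab = initializeSymspell()
suggestions = symspell.lookup_compound("teh qick brwn fox", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)  # top suggestion for the whole phrase
print("fox" in vocab)           # vocab holds every unigram dictionary term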
Example #3
class SymSpellCorrection:
    """
        Use SymSpell for correction
    """
    def __init__(self, dictionary_path, term_index=0, count_index=1, max_edit_distance_dictionary=0, prefix_length=7, **args):
        """
        Input:
            - dictionary_path: string
            - term_index: int, column of the term in the dictionary text file, default is 0
            - count_index: int, column of the term frequency in the dictionary text file, default is 1
            - max_edit_distance_dictionary: int, maximum edit distance per dictionary precalculation, default is 0
            - prefix_length: int, default is 7
        """
        from symspellpy.symspellpy import SymSpell
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.sym_spell.load_dictionary(dictionary_path, term_index, count_index)

    def __call__(self, sentence):
        """
            Input:
                - sentence: string

            Output:
                - string
        """
        if len(sentence) < 1:
            return sentence
        try:
            corrected = self.sym_spell.word_segmentation(sentence).corrected_string
        except Exception:
            print("Error spell correction:", sentence)
            corrected = sentence
        return corrected
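# A minimal usage sketch for SymSpellCorrection; the dictionary path and the
# edit distance of 2 are assumptions made for illustration:
corrector = SymSpellCorrection("frequency_dictionary_en_82_765.txt",
                               max_edit_distance_dictionary=2)
print(corrector("thequickbrownfoxjumpsoverthelazydog"))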
Example #4
    def _create_symspell_checker(self, language: AnyStr) -> SymSpell:
        """Private method to create a SymSpell instance for a given language

        Args:
            language: Language code in ISO 639-1 format

        Returns:
            SymSpell checker instance loaded with the language dictionary

        """
        start = perf_counter()
        logging.info(f"Loading spellchecker for language '{language}'...")
        symspell_checker = SymSpell(
            max_dictionary_edit_distance=self.edit_distance)
        frequency_dict_path = self.dictionary_folder_path + "/" + language + ".txt"
        symspell_checker.load_dictionary(frequency_dict_path,
                                         term_index=0,
                                         count_index=1,
                                         encoding="utf-8")
        if len(self.custom_vocabulary_set) != 0:
            for word in self.custom_vocabulary_set:
                symspell_checker.create_dictionary_entry(key=word, count=1)
        logging.info(
            f"Loading spellchecker for language '{language}': done in {perf_counter() - start:.2f} seconds"
        )
        return symspell_checker
Example #5
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Train a symspell Spell Corrector.

    Returns
    -------
    result: malaya.spell.SYMSPELL class
    """

    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError:
        raise Exception(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
Example #6
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    sym_spell.load_dictionary(
        "/home/yadi/projectDISK/Python-Projects/ML-NLP/dictionary.txt", 0, 1)

    # lookup suggestions for multi-word input strings (supports compound
    # splitting & merging)
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him."
                  "I'm workig in th e yadolah shahrary working in githib")
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 1
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup,
                                            transfer_casing=True)
    # display the corrected suggestion terms
    print(input_term)
    for suggestion in suggestions:
        print("{}".format(suggestion.term))
Example #7
def load_spell_checker():
    """Return spell checker"""
    if not os.path.exists("data/unigrams.txt"):
        sents = [normalize_text(" ".join(x)).split() for x in floresta.sents()]
        sents += [normalize_text(" ".join(x)).split() for x in machado.sents()]
        sents += [
            normalize_text(" ".join(x)).split() for x in mac_morpho.sents()
        ]

        unigrams = [item for sublist in sents for item in sublist]
        unigrams = nltk.probability.FreqDist(unigrams)

        with open("data/unigrams.txt", "w") as file:
            for k, v in unigrams.items():
                file.write(f"{k} {v}\n")

        bigrams = []
        for sent in sents:
            bigrams += list(nltk.bigrams(sent))
        bigrams = nltk.probability.FreqDist(bigrams)

        with open("data/bigrams.txt", "w") as file:
            for k, v in bigrams.items():
                file.write(f"{' '.join(k)} {v}\n")

    result = SymSpell()

    result.load_dictionary("data/unigrams.txt", 0, 1)
    result.load_bigram_dictionary("data/bigrams.txt", 0, 2)

    return result
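# A minimal usage sketch for load_spell_checker(), assuming the NLTK Portuguese
# corpora used above are available and a local "data/" directory exists; the
# query phrase is made up:
checker = load_spell_checker()
result = checker.lookup_compound("ola mundo", max_edit_distance=2)
if result:
    print(result[0].term)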
Example #8
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Returns
    -------
    result: malaya.spell.Symspell class
    """

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except BaseException:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.')

    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)

    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
Example #9
    def test_lookup_should_replicate_noisy_results(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = path.realpath(path.dirname(__file__))
        dictionary_path = path.realpath(
            path.join(cwd, pardir, "symspellpy",
                      "frequency_dictionary_en_82_765.txt"))
        query_path = path.join(cwd, "fortests", "noisy_query_en_1000.txt")

        edit_distance_max = 2
        prefix_length = 7
        verbosity = Verbosity.CLOSEST
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        test_list = []
        with open(query_path, "r") as infile:
            for line in infile.readlines():
                line_parts = line.rstrip().split(" ")
                if len(line_parts) >= 2:
                    test_list.append(line_parts[0])
        result_sum = 0
        for phrase in test_list:
            result_sum += len(
                sym_spell.lookup(phrase, verbosity, edit_distance_max))
        self.assertEqual(4945, result_sum)
def createSymSpell(dict_path='ru-100k.txt', encoding='utf-8'):
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
    symspell.load_dictionary(dict_path,
                             encoding=encoding,
                             term_index=0,
                             count_index=1)
    return symspell
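# A minimal usage sketch for createSymSpell(), assuming the Russian frequency
# file "ru-100k.txt" is present; the query word is illustrative:
from symspellpy.symspellpy import Verbosity

ru_speller = createSymSpell()
for s in ru_speller.lookup("привет", Verbosity.CLOSEST, max_edit_distance=2):
    print(s.term, s.distance, s.count)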
Example #11
def load_symspell(dict_path='symspell/frequency_dictionary_en_82_765.txt',
                  max_edit_distance_dictionary=2,
                  prefix_length=7,
                  term_index=0,
                  count_index=1):
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(dict_path, term_index, count_index)
    return sym_spell
def symspell_checker(text):
    from symspellpy.symspellpy import SymSpell
    spell = SymSpell()
    spell.load_dictionary(r"frequency_dictionary_en_82_765.txt", 0, 1)
    spell.load_bigram_dictionary(r"frequency_bigramdictionary_en_243_342.txt", 0, 2)
    result = spell.lookup_compound(text, 2)
    # return the top suggestion, or the original text if there is none
    return result[0].term if result else text
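# A minimal usage sketch for the two helpers above, assuming the symspellpy
# frequency dictionaries sit at the paths the functions expect:
sym = load_symspell()
print(sym.lookup_compound("whereis th elove", 2)[0].term)
print(symspell_checker("whereis th elove"))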
Example #13
def symspell(
    validate=True,
    max_edit_distance_dictionary=2,
    prefix_length=7,
    term_index=0,
    count_index=1,
    top_k=10,
):
    """
    Train a symspell Spell Corrector.

    Parameters
    ----------
    validate: bool, optional (default=True)
        if True, malaya will check model availability and download if not available.

    Returns
    -------
    _SpellCorrector: malaya.spell._SymspellCorrector class
    """
    if not isinstance(validate, bool):
        raise ValueError('validate must be a boolean')
    if not isinstance(max_edit_distance_dictionary, int):
        raise ValueError('max_edit_distance_dictionary must be an integer')
    if not isinstance(prefix_length, int):
        raise ValueError('prefix_length must be an integer')
    if not isinstance(term_index, int):
        raise ValueError('term_index must be an integer')
    if not isinstance(count_index, int):
        raise ValueError('count_index must be an integer')

    if validate:
        check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'])
        check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1])
    else:
        if not check_available(PATH_NGRAM['symspell']):
            raise Exception(
                'preprocessing is not available, please `validate = True`')
        if not check_available(PATH_NGRAM[1]):
            raise Exception(
                'preprocessing is not available, please `validate = True`')
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError:
        raise Exception(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return _SymspellCorrector(sym_spell, Verbosity.ALL, corpus, k=top_k)
Example #14
    def __new__(cls):
        if cls._instance is None:
            # SymSpell configuration
            max_edit_distance_dictionary = 3
            prefix_length = 4
            spellchecker = SymSpell(max_edit_distance_dictionary, prefix_length)
            dictionary_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_dictionary_en_82_765.txt")
            bigram_path = pkg_resources.resource_filename(
                "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
            spellchecker.load_dictionary(dictionary_path, term_index=0, count_index=1)
            spellchecker.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)
            cls._instance = spellchecker
        return cls._instance
Example #15
    def test_lookup_compound_ignore_non_words(self):
        print('  - %s' % inspect.stack()[0][3])
        cwd = path.realpath(path.dirname(__file__))
        dictionary_path = path.realpath(
            path.join(cwd, pardir, "symspellpy",
                      "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        typo = ("whereis th elove 123 hehad dated forImuch of THEPAST who "
                "couqdn'tread in SIXTHgrade and ins pired him")
        correction = ("where is the love 123 he had dated for much of THEPAST "
                      "who couldn't read in sixth grade and inspired him")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan"
        correction = ("in the DHIRD 1 quarter of last year he had learned "
                      "of a secret plan")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
                "of 12 funn")
        correction = ("the biggest players in the strong summer film slate "
                      "with PLETY of 12 fun")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = ("Can yu readtHIS messa ge despite thehorible 1234 "
                "sppelingmsitakes")
        correction = ("can you read this message despite the horrible 1234 "
                      "spelling mistakes")
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)

        typo = "PI on leave, arrange Co-I to do screening"
        correction = "PI on leave arrange co i to do screening"
        results = sym_spell.lookup_compound(typo, edit_distance_max, True)
        self.assertEqual(1, len(results))
        self.assertEqual(correction, results[0].term)
Example #16
def correct_spelling(sentence):
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 5
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    if "&amp ;" in sentence:
        sentence = sentence.replace("&amp ;", "and")
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(sentence, max_edit_distance_lookup)
    save = ""
    for suggestion in suggestions:
        save = suggestion.term
        #print("{}".format(save))
        break

    #if "#" in save:
    #    save = sym_spell.word_segmentation(save)

    return save
Example #17
def spelling_preprocessor():
    import os
    from symspellpy.symspellpy import SymSpell, Verbosity

    max_edit_distance_dictionary = 2
    prefix_length = 7

    sc = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(os.getenv('HOME'), 'symspellpy/symspellpy/frequency_dictionary_en_82_765.txt')
    term_index = 0
    count_index = 1

    if not sc.load_dictionary(dictionary_path, term_index, count_index):
        raise ImportError('Unable to load spelling dictionary')

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST

    @string_check
    def checker(s):
        words = s.split()
        corrected_words = list()

        for word in words:
            correction = sc.lookup(word, suggestion_verbosity, max_edit_distance_lookup)
            if correction:
                corrected_words.append(correction[0].term)
            else:
                corrected_words.append(word)
        return ' '.join(corrected_words)
    return checker
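# A minimal usage sketch for spelling_preprocessor(), assuming the symspellpy
# checkout under $HOME that the function expects and the string_check decorator
# defined elsewhere in this module:
check = spelling_preprocessor()
print(check("speling mistaks are evrywhere"))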
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7

    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__), "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # lookup suggestions for single-word input strings
    input_term = "bangeeet"  # misspelling
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                   max_edit_distance_lookup)
    # display suggestion term, term frequency, and edit distance
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def spelling_correction(data,column):
    from symspellpy.symspellpy import SymSpell , Verbosity
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
       print("Dictionary file not found")

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    df_final = pd.DataFrame()
    for index , row in data.iterrows():
        # lookup suggestions for single-word input strings
        text = row[column]
        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_edit_distance_dictionary)
        for input_term in text.split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                       max_edit_distance_lookup)
            if len(suggestions) > 0:
                df_local = pd.DataFrame({'Original Word': [input_term],
                                         'Replacement': [suggestions[0].term]})
                df_final = pd.concat([df_final, df_local], ignore_index=True)
    return df_final
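# A minimal usage sketch for spelling_correction(); the DataFrame and column
# name are made up for illustration, and pandas is assumed to be imported as pd:
sample = pd.DataFrame({"text": ["speling mistke", "recieve the pakage"]})
print(spelling_correction(sample, "text"))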
Example #20
def setup(initial_capacity=83000,
          prefix_length=7,
          max_edit_distance_dictionary=2):

    global maximum_edit_distance
    maximum_edit_distance = max_edit_distance_dictionary

    dict_path = '/home/fa6/data/symspellpy/frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(initial_capacity,
                         max_edit_distance_dictionary,
                         prefix_length,
                         count_threshold=30)

    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dict_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # lookup suggestions for single-word input strings
    # input_term = "memebers"  # misspelling of "members"
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    # max_edit_distance_lookup = 2
    # suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    # suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
    #                                max_edit_distance_lookup)
    # # display suggestion term, term frequency, and edit distance
    # for suggestion in suggestions:
    #     print("{}, {}, {}".format(suggestion.term, suggestion.count,
    #                               suggestion.distance))

    return sym_spell
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 3
    f = open("note.html", "r")
    noteString = f.read()
    noteString = stripHTML(noteString)
    print(noteString)
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him. But who aree yooui to say its not. I am.")

    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(noteString,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))

    tend = datetime.now()
    time = tend - tstart
    print(time.seconds)
Example #22
def create_context_speller():
  """Creates a context speller, which uses the context frequency lookup table"""

  # Initialize Context Symspell Checker
  context_sym_spell = SymSpell(83000, 2, 7)

  # load dictionary
  lookup_path = os.path.join(os.path.dirname(
      __file__), "./data/dict/context_dist_small.txt")

  if not context_sym_spell.load_dictionary(lookup_path, 0, 1):
    raise Exception("Dictionary file not found")

  # Creates the spell checker
  def check_spell(word): 
    suggestions = context_sym_spell.lookup(word, Verbosity.CLOSEST, 2)
    if len(suggestions) == 0:
      # Not in context
      return True
    else:
      correct = True
      for suggestion in suggestions:
        if suggestion.distance == 1:
          correct = False
        
      return correct
  
  return check_spell
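# A minimal usage sketch for create_context_speller(), assuming the context
# dictionary file referenced inside the function exists:
check_spell = create_context_speller()
# True unless a distance-1 suggestion exists in the context dictionary
print(check_spell("word"))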
Example #23
def correctly_spelled(data, max_edit_distance_lookup=None):
    global sym_speller  # Make the SymspellPy-based speller global to be able to be used in the body of this function
    if sym_speller is None:  # If the speller is not initialized
        sym_speller = SymSpell(
            max_edit_distance_dictionary,
            prefix_length)  # Initialize the speller provided its parameters as
        # previously defined
        sym_spell_dict_path = os.path.join(os.path.dirname(__file__),
                                           "frequency_dictionary_en_82_765.txt"
                                           )  # Load the frequency dictionary
        # to the speller
        term_index = 0  # Column of the term in the dictionary text file
        count_index = 1  # Column of the term frequency in the dictionary text file
        if not sym_speller.load_dictionary(
                sym_spell_dict_path, term_index,
                count_index):  # If the dictionary was not found
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  sym_spell_dict_path
                  )  # Print error message informing about this
            os._exit(1)  # Exit the entire program

    if max_edit_distance_lookup is None:  # If no maximum edit distance during lookup is specified
        max_edit_distance_lookup = max_edit_distance_dictionary  # Assign the same edit distance to that as to the maximum edit distance
        # on the dictionary

    # Correct spelling of each token in the text and return the data sample
    return " ".join([
        (sym_speller.lookup_compound(t, max_edit_distance_lookup)[0].term if
         t.isalpha() and not (t == data[0] or t == data[1] or
                              ("".join([x[0] for x in data[1].split()]) == t if
                               len(data[1].split()) >= 3 else False)) else t)
        for t in tokenized(data[2])
    ])
Example #24
def getSymspellDict(direc):
    print("loading symspell object")
    sym_spell = SymSpell(83000, 2, 7)
    if not sym_spell.load_dictionary(direc, 0, 1):
        print("Dictionary file not found")

    return sym_spell
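# A minimal usage sketch for getSymspellDict(); the dictionary path is an
# assumption:
sym_spell = getSymspellDict("frequency_dictionary_en_82_765.txt")
print(sym_spell.lookup_compound("teh dictionry", 2)[0].term)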
Example #25
def spell_correction(texte):
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return
    input_term = texte
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    if (len(suggestions) > 0):
        return suggestions[0].term
    else:
        print("error with : ", texte)
        return texte
Example #26
    class SpellCorrector():
        def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
            self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
            # load dictionary
            dictionary_path = os.path.join(os.path.dirname('../'),
                                           "frequency_dictionary_en_82_765.txt")
            term_index = 0  # column of the term in the dictionary text file
            count_index = 1  # column of the term frequency in the dictionary text file
            if not self.sym_spell.load_dictionary(dictionary_path, term_index, count_index):
                raise("Dictionary file not found")

            # manually
            # this works. about 0.003 up
            # self.corr_dict = {"awsome": "awesome"}

        def reduce_lengthening(self, text):
            # not work
            pattern = re.compile(r"(.)\1{2,}")
            return pattern.sub(r"\1\1", text)

        def strip_punc(self, word):
            # not work
            return re.sub(r"[\-\_\.\!]$", "", word)

        def __call__(self, word):
            word = self.reduce_lengthening(word)
            # if word in self.corr_dict:
            #     word = self.corr_dict[word]
            if len(word) > 2 and "'" not in word:
                suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, 2)
                if suggestions:
                    return suggestions[0].term
            return word
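# A minimal usage sketch for SpellCorrector, assuming Verbosity is imported from
# symspellpy and the frequency dictionary is reachable at the path built above:
corrector = SpellCorrector()
print(corrector("awsome"))  # expected to resemble "awesome"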
    def load(cls, language: str) -> "SpellCorrectGenerator":
        # maximum edit distance per dictionary pre-calculation
        max_edit_distance_dictionary = 2
        prefix_length = 7
        # create object
        sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        if language == "en":
            dict_path = (
                pathlib.Path(__file__).parent
                / "resources"
                / "frequency_dictionary_en_82_765.txt"
            )
            sym_spell.load_dictionary(str(dict_path), term_index=0, count_index=1)
            spacy_model = spacy.load("en_core_web_sm", disable=["parser", "ner"])
        else:
            raise RuntimeError(f"The language {language} is currently not supported.")
        return cls(sym_spell, spacy_model)
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9
    # data = pd.read_csv('D:/ML/QNA_project/CSV_files/final_words_total_rd2.csv')

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  #
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # lookup suggestions for single-word input strings

    # input_term = "agricultr"  # misspelling of "members"
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    s = ""
    # print('original')
    # print(len(words))
    # for i in range(len(data)):
    #     # print(i)
    #     if i==0 or i==51124 or i==65070:
    #         continue
    #     input_term = data['Final_words'][i]
    #     suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
    #                                max_edit_distance_lookup)
    #     print(i)
    #     try:
    #         s = s + str(suggestions[0].term)+" "
    #     except:
    #         s = s+ input_term
    #
    # s = s[:-1]
    # words = s.split(' ')
    # # print(len(words))
    # print('After')
    # print(len(words))
    # for suggestion in suggestions:
    #     print("{}, {}, {}".format(suggestion.term, suggestion.distance,
    #                               suggestion.count))

    # input_term = ("whereis th elove hehad dated forImuch of thepast who "
    #               "couqdn'tread in sixtgrade and ins pired him")
    input_term = 'live'
    # max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def symspell_test(tokenpos_list,
                  max_edit_distance_lookup=3,
                  initial_capacity=83000,
                  max_edit_distance_dictionary=3,
                  prefix_length=7,
                  term_index=0,
                  count_index=1):
    """
    Test the SymSpell library's spell-checking performance on a list of
    (token, POS) pairs.
    Keyword arguments:
        ** max_edit_distance_lookup : maximum edit distance per lookup (recommended maximum = 3)
        ** term_index : term column in the dictionary file (0)
        ** count_index : frequency column in the dictionary file (1)
    """
    print('\n{} \nBegin \'Symspellpy\' testing \n'.format('#' * 20))

    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        suggestion_verbosity = Verbosity.CLOSEST

        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(dictionary_path, term_index,
                                         count_index):
            print("Dictionary file not found")
            return 'Error loading dictionary file'
        suggestion_list = []
        proper_noun = []

        for (word, pos) in tokenpos_list:
            if pos == 'PROPN':
                suggestion_list.append(word)
                proper_noun.append(word)
            elif len(word) < 3:
                suggestion_list.append(word)
                proper_noun.append(word)
            else:
                suggestions = sym_spell.lookup(word, suggestion_verbosity,
                                               max_edit_distance_lookup)
                suggestion = (list(suggestions))[0]
                # display suggestion term, term frequency, and edit distance
                print(
                    "input_term = {}, suggestion_term = {}, suggestion_count = {},\
                suggestion_distance =  {}".format(word, suggestion.term,
                                                  suggestion.count,
                                                  suggestion.distance))
                suggestion_list.append(suggestion.term)
        print("\n\nThe corrected sentence is : {}".format(
            ' '.join(suggestion_list)))
        print(suggestion_list)
        print(proper_noun)
        return suggestion_list, proper_noun
    except TypeError as error:
        print(f'Invalid type : {error}')
        return 405
Example #30
def init():
    ''' Init symspellpy, loading the frequency words models
    (dictionary and bigram dictionary)
    '''
    global sym_spell
    max_edit_distance_dictionary = 2
    prefix_length = 7

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    #sym_spell.load_dictionary(os.path.dirname(os.path.abspath(__file__)) + "/frequency_words_models/pt_frequency_50k.txt", term_index=0, count_index=1)
    sym_spell.load_dictionary(os.path.dirname(os.path.abspath(__file__)) +
                              "/frequency_words_models/fw_pt.txt",
                              term_index=0,
                              count_index=1)
    sym_spell.load_bigram_dictionary(
        os.path.dirname(os.path.abspath(__file__)) +
        "/frequency_words_models/fw_bi_pt.txt",
        term_index=0,
        count_index=2)
Example #31
def main(argv):
    if len(argv) == 3:
        input = argv[1]
        markdown = argv[2]
    else:
        print ('usage:\n    python .py "<categoria>" <markdown gerado>')
        return
    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary, prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    categorys = open(dictionary_path, 'r')
    d = defaultdict(lambda: 0)
    for x in categorys.readlines():
        z = x.split(' ')
        d[z[0]] = z[2]

    f = open(markdown, 'a')
    f.write('\n## Lista de Exercicios - %s\n' % (input).capitalize())
    input = input.lower()
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    inputs = input.split(' ')
    total_avg = sum( map(len, inputs) ) / len(inputs)

    max_edit_distance_lookup = 3 if total_avg > 4 else 2
    for input_term in inputs:
        suggestions = sym_spell.lookup(input_term, suggestion_verbosity, max_edit_distance_lookup)
        for suggestion in suggestions:
            f.write("* {}, https://a2oj.com/{}".format((suggestion.term).capitalize(), d[suggestion.term]))

    f.close()
    categorys.close()