def main():
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    # create object
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    # expected: "the quick brown fox jumps over the lazy dog"
    result = sym_spell.word_segmentation(input_term)
    # display corrected string, summed edit distance, and summed log probability
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
def initializeSymspell():
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")
    resourceNames = ["symspellpy",
                     "frequency_dictionary_en_82_765.txt",
                     "frequency_bigramdictionary_en_243_342.txt"]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")
    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_dictionary() done")
    # the bundled bigram dictionary stores the count in its third column
    symspell.load_bigram_dictionary(bigramPath, 0, 2)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_dictionary() done")

    # Create vocab
    vocab = set([w for w, f in symspell.words.items()])
    return symspell, vocab
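# Hypothetical usage sketch for initializeSymspell() above: assumes symspellpy with its
# bundled dictionaries is installed; Verbosity is the standard symspellpy enum.
from symspellpy.symspellpy import Verbosity

symspell, vocab = initializeSymspell()
print("vocabulary size:", len(vocab))
for suggestion in symspell.lookup("aple", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.distance, suggestion.count)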
def spell_correction(texte):
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(dictionary_path, term_index=0,
                                     count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(bigram_path, term_index=0,
                                            count_index=2):
        print("Bigram dictionary file not found")
        return
    input_term = texte
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    if len(suggestions) > 0:
        return suggestions[0].term
    else:
        print("error with : ", texte)
        return texte
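# Hypothetical usage sketch for spell_correction() above: assumes the French frequency
# file ../ressources/fr-100k.txt is available; the first lookup_compound suggestion is
# returned as the corrected sentence.
corrected = spell_correction("bonjou le monde")
print(corrected)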
def __init__(self, lm, max_ed=4, prefix_length=7, l=1,
             channel_method_poisson=True, channel_prob_param=0.02):
    self.show_progress = False
    self.lm = lm
    self.l = l
    self.channel_method_poisson = channel_method_poisson
    self.channel_prob_param = channel_prob_param
    self.sym_spell = SymSpell(max_ed, prefix_length)
    if isinstance(self.lm, GPT2LMHeadModel):
        self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
        self.lm_sent_logscore = self.gpt2_sent_logscore
        self.beam_init = self.beam_GPT_init
        self.skipstart = 1
        self.skipend = -1
        self.update_sentence_history = self.updateGPT2history
        self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
        for subword in range(self.tokenizer.vocab_size):
            self.sym_spell.create_dictionary_entry(
                key=self.tokenizer.decode(subword), count=1)
    else:
        self.lm_sent_logscore = self.ngram_sent_logscore
        self.beam_init = self.beam_ngram_init
        self.skipstart = self.lm.order - 1
        self.skipend = None
        self.update_sentence_history = self.updatengramhistory
        self.tokenizer = ngramTokenizer(self.lm)
        for word in lm.vocab:
            self.sym_spell.create_dictionary_entry(key=word,
                                                   count=self.lm.counts[word])
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__),
    #                                "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # lookup suggestions for single-word input strings
    input_term = "bangeeet"  # misspelling
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                   max_edit_distance_lookup)
    # display suggestion term, term frequency, and edit distance
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def main():
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 3
    f = open("note.html", "r")
    noteString = f.read()
    noteString = stripHTML(noteString)
    print(noteString)
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him. But who aree "
                  "yooui to say its not. I am.")
    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(noteString,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
    tend = datetime.now()
    time = tend - tstart
    print(time.seconds)
def __init__(self, max_dictionary_edit_distance=2, prefix_length=7,
             dictionary_path=None):
    # maximum edit distance for doing lookups
    self.max_dictionary_edit_distance = max_dictionary_edit_distance
    # length of word prefixes used for spell checking
    self.prefix_length = prefix_length
    # create object
    self.sym_spell = SymSpell(
        max_dictionary_edit_distance=self.max_dictionary_edit_distance,
        prefix_length=self.prefix_length)
    # load dictionary
    if dictionary_path is None:
        dictionary_path = os.path.join(
            os.path.dirname(__file__),
            "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                          count_index):
        print('Dictionary file not found')
def setup(initial_capacity=83000, prefix_length=7,
          max_edit_distance_dictionary=2):
    global maximum_edit_distance
    maximum_edit_distance = max_edit_distance_dictionary
    dict_path = '/home/fa6/data/symspellpy/frequency_dictionary_en_82_765.txt'
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length, count_threshold=30)
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dict_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # lookup suggestions for single-word input strings
    # input_term = "memebers"  # misspelling of "members"
    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    # max_edit_distance_lookup = 2
    # suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    # suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
    #                                max_edit_distance_lookup)
    # # display suggestion term, term frequency, and edit distance
    # for suggestion in suggestions:
    #     print("{}, {}, {}".format(suggestion.term, suggestion.count,
    #                               suggestion.distance))
    return sym_spell
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Returns
    -------
    result: malaya.spell.Symspell class
    """
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except BaseException:
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.')
    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)
    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
def spelling_preprocessor():
    import os
    from symspellpy.symspellpy import SymSpell, Verbosity

    max_edit_distance_dictionary = 2
    prefix_length = 7
    sc = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.join(
        os.getenv('HOME'),
        'symspellpy/symspellpy/frequency_dictionary_en_82_765.txt')
    term_index = 0
    count_index = 1
    if not sc.load_dictionary(dictionary_path, term_index, count_index):
        raise ImportError('Unable to load spelling dictionary')
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST

    @string_check
    def checker(s):
        words = s.split()
        corrected_words = list()
        for word in words:
            correction = sc.lookup(word, suggestion_verbosity,
                                   max_edit_distance_lookup)
            if correction:
                corrected_words.append(correction[0].term)
            else:
                corrected_words.append(word)
        return ' '.join(corrected_words)

    return checker
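# Hypothetical usage sketch for spelling_preprocessor() above: assumes the string_check
# decorator is defined in the same module and the dictionary path under $HOME exists;
# the returned closure corrects the input word by word.
checker = spelling_preprocessor()
print(checker("thsi sentnce has a fw typos"))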
def createSymSpell(dict='ru-100k.txt', encoding='utf-8'):
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
    symspell.load_dictionary(dict, encoding=encoding, term_index=0,
                             count_index=1)
    return symspell
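# Hypothetical usage sketch for createSymSpell() above: assumes the Russian frequency
# file ru-100k.txt sits next to the script; the query word is an illustrative misspelling.
from symspellpy.symspellpy import Verbosity

symspell = createSymSpell()
for suggestion in symspell.lookup("превед", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.count)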
def create_context_speller():
    """Creates a context speller, which uses the context frequency lookup table"""
    # Initialize Context Symspell Checker
    context_sym_spell = SymSpell(83000, 2, 7)
    # load dictionary
    lookup_path = os.path.join(os.path.dirname(__file__),
                               "./data/dict/context_dist_small.txt")
    if not context_sym_spell.load_dictionary(lookup_path, 0, 1):
        raise Exception("Dictionary file not found")

    # Creates the spell checker
    def check_spell(word):
        suggestions = context_sym_spell.lookup(word, Verbosity.CLOSEST, 2)
        if len(suggestions) == 0:
            # Not in context
            return True
        else:
            correct = True
            for suggestion in suggestions:
                if suggestion.distance == 1:
                    correct = False
            return correct

    return check_spell
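# Hypothetical usage sketch for create_context_speller() above: assumes
# ./data/dict/context_dist_small.txt exists; check_spell returns True when the word is
# either absent from the context table or has no edit-distance-1 neighbour.
check_spell = create_context_speller()
print(check_spell("apple"))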
def test_words_from_list_with_shared_prefix_should_retain_counts(self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 1, 3, words=[
        "pipe", "pipe", "pipe", "pipe", "pipe",
        "pips", "pips", "pips", "pips", "pips",
        "pips", "pips", "pips", "pips", "pips"
    ])
    result = sym_spell.lookup("pipe", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pipe", result[0].term)
    self.assertEqual(5, result[0].count)
    self.assertEqual("pips", result[1].term)
    self.assertEqual(10, result[1].count)
    result = sym_spell.lookup("pips", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
    result = sym_spell.lookup("pip", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
def correct_spelling(sentence):
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 5
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    if "& ;" in sentence:
        sentence = sentence.replace("& ;", "and")
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(sentence,
                                            max_edit_distance_lookup)
    save = ""
    for suggestion in suggestions:
        save = suggestion.term
        # print("{}".format(save))
        break
    # if "#" in save:
    #     save = sym_spell.word_segmentation(save)
    return save
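# Hypothetical usage sketch for correct_spelling() above: assumes
# frequency_dictionary_en_82_765.txt sits next to the script; lookup_compound corrects
# the whole sentence and the first suggestion is returned.
print(correct_spelling("whereis th elove hehad dated forImuch of thepast"))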
def test_words_with_shared_prefix_should_retain_counts(self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 1, 3)
    sym_spell.create_dictionary_entry("pipe", 5)
    sym_spell.create_dictionary_entry("pips", 10)
    result = sym_spell.lookup("pipe", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pipe", result[0].term)
    self.assertEqual(5, result[0].count)
    self.assertEqual("pips", result[1].term)
    self.assertEqual(10, result[1].count)
    result = sym_spell.lookup("pips", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
    result = sym_spell.lookup("pip", Verbosity.ALL, 1)
    self.assertEqual(2, len(result))
    self.assertEqual("pips", result[0].term)
    self.assertEqual(10, result[0].count)
    self.assertEqual("pipe", result[1].term)
    self.assertEqual(5, result[1].count)
def getSymspellDict(direc):
    print("loading symspell object")
    sym_spell = SymSpell(83000, 2, 7)
    if not sym_spell.load_dictionary(direc, 0, 1):
        print("Dictionary file not found")
    return sym_spell
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Train a symspell Spell Corrector.

    Returns
    -------
    result: malaya.spell.SYMSPELL class
    """
    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except:
        raise Exception(
            'symspellpy not installed. Please install it and try again.')
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
def main(): # maximum edit distance per dictionary precalculation max_edit_distance_dictionary = 2 prefix_length = 7 # create object sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length) # load dictionary dictionary_path = os.path.join(os.path.dirname(__file__), "frequency_dictionary_en_82_765.txt") term_index = 0 # column of the term in the dictionary text file count_index = 1 # column of the term frequency in the dictionary text file if not sym_spell.load_dictionary(dictionary_path, term_index, count_index): print("Dictionary file not found") return sym_spell.load_dictionary( "/home/yadi/projectDISK/Python-Projects/ML-NLP/dictionary.txt", 0, 1) # lookup suggestions for multi-word input strings (supports compound # splitting & merging) input_term = ("whereis th elove hehad dated forImuch of thepast who " "couqdn'tread in sixtgrade and ins pired him." "I'm workig in th e yadolah shahrary working in githib") # max edit distance per lookup (per single word, not per whole input string) max_edit_distance_lookup = 1 suggestions = sym_spell.lookup_compound(input_term, max_edit_distance_lookup, transfer_casing=True) # display suggestion term, edit distance, and term frequency print(input_term) for suggestion in suggestions: print("{}".format(suggestion.term))
def __init__(self, train=False, save=False, corpus_path=CORPUS_PATH,
             threshold=2):
    self.slang_dict = pickle.load(
        open(os.path.join(os.path.dirname(__file__),
                          "pickled/_slang_words.p"), "rb"))
    self.slang_dict['dr'] = 'dari'
    self.slang_dict['k'] = 'ke'
    self.slang_dict['sc'] = 'sesar'
    if train:
        create_dictionary.main()
        self.words = self.__words(corpus_path)
        self.counter = self.__counter(self.words)
        self.model = model.LanguageModel(corpus_path=corpus_path)
    else:
        self.words = pickle.load(
            open(os.path.join(os.path.dirname(__file__),
                              "pickled/_spell_words.p"), "rb"))
        self.counter = pickle.load(
            open(os.path.join(os.path.dirname(__file__),
                              "pickled/_spell_counter.p"), "rb"))
        self.model = model.LanguageModel(load=True)
    try:
        for key in self.counter:
            if self.counter[key] <= threshold:
                self.words.remove(key)
    except:
        pass
    self.candidates_dict = {}
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    self.factory = StemmerFactory()
    self.stemmer = self.factory.create_stemmer()
    # load dictionary
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "corpus/dictionary/dictionary.txt")
    # dictionary_path = os.path.join(os.path.dirname(__file__),
    #                                "corpus/symspellpy/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                          count_index, encoding="utf-8"):
        print("Dictionary file not found")
        return
    if save:
        self.save()
def correctly_spelled(data, max_edit_distance_lookup=None):
    # Make the SymspellPy-based speller global to be able to be used in the
    # body of this function
    global sym_speller
    if sym_speller is None:  # If the speller is not initialized
        # Initialize the speller provided its parameters as previously defined
        sym_speller = SymSpell(max_edit_distance_dictionary, prefix_length)
        # Load the frequency dictionary to the speller
        sym_spell_dict_path = os.path.join(os.path.dirname(__file__),
                                           "frequency_dictionary_en_82_765.txt")
        term_index = 0  # Column of the term in the dictionary text file
        count_index = 1  # Column of the term frequency in the dictionary text file
        if not sym_speller.load_dictionary(sym_spell_dict_path, term_index,
                                           count_index):
            # If the dictionary was not found, print an error message
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  sym_spell_dict_path)
            os._exit(1)  # Exit the entire program
    if max_edit_distance_lookup is None:
        # If no maximum edit distance during lookup is specified, use the same
        # edit distance as the maximum edit distance on the dictionary
        max_edit_distance_lookup = max_edit_distance_dictionary
    # Correct spelling of each token in the text and return the data sample
    return " ".join([
        (sym_speller.lookup_compound(t, max_edit_distance_lookup)[0].term
         if t.isalpha() and not (t == data[0] or t == data[1]
                                 or ("".join([x[0] for x in data[1].split()]) == t
                                     if len(data[1].split()) >= 3 else False))
         else t)
        for t in tokenized(data[2])
    ])
def spelling_correction(data, column):
    from symspellpy.symspellpy import SymSpell, Verbosity

    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load dictionary
    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    df_final = pd.DataFrame()
    for index, row in data.iterrows():
        # lookup suggestions for single-word input strings
        text = row[column]
        # max edit distance per lookup
        # (max_edit_distance_lookup <= max_edit_distance_dictionary)
        for input_term in text.split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if len(suggestions) > 0:
                df_local = pd.DataFrame({'Original Word': [input_term],
                                         'Replacement': [suggestions[0].term]})
                df_final = df_final.append(df_local)
    return df_final
def initialize(self):
    print("Initializing Text Cleaner..")
    print("Initializing Smart Contractions Module..")
    self.cont = Contractions(self.embedding_for_smart_contraction)
    self.cont.load_models()
    print("Initializing Stopwords Module..")
    self.stop_words = set(stopwords.words('english'))
    stop_words_without_negation = copy.deepcopy(self.stop_words)
    stop_words_without_negation.remove('no')
    stop_words_without_negation.remove('nor')
    stop_words_without_negation.remove('not')
    self.stop_words_without_negation = stop_words_without_negation
    self.pos_tags_set_1 = {'NNP'}
    print("Initializing Wordnet Lemmatizer Module..")
    self.wnl = WordNetLemmatizer()
    print("Initializing Spellcheck Module..")
    max_edit_distance_dictionary = 2
    prefix_length = 7
    self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = os.path.abspath('') + "\\" + self.spell_dictonarypath
    self.sym_spell.load_dictionary(dictionary_path, 0, 1)
    print("Initialization complete!")
def _create_symspell_checker(self, language: AnyStr) -> SymSpell:
    """Private method to create a SymSpell instance for a given language

    Args:
        language: Language code in ISO 639-1 format

    Returns:
        SymSpell checker instance loaded with the language dictionary

    """
    start = perf_counter()
    logging.info(f"Loading spellchecker for language '{language}'...")
    symspell_checker = SymSpell(
        max_dictionary_edit_distance=self.edit_distance)
    frequency_dict_path = self.dictionary_folder_path + "/" + language + ".txt"
    symspell_checker.load_dictionary(frequency_dict_path, term_index=0,
                                     count_index=1, encoding="utf-8")
    if len(self.custom_vocabulary_set) != 0:
        for word in self.custom_vocabulary_set:
            symspell_checker.create_dictionary_entry(key=word, count=1)
    logging.info(
        f"Loading spellchecker for language '{language}': done in "
        f"{perf_counter() - start:.2f} seconds")
    return symspell_checker
def test_lookup_should_replicate_noisy_results(self):
    print(' - %s' % inspect.stack()[0][3])
    cwd = path.realpath(path.dirname(__file__))
    dictionary_path = path.realpath(
        path.join(cwd, pardir, "symspellpy",
                  "frequency_dictionary_en_82_765.txt"))
    query_path = path.join(cwd, "fortests", "noisy_query_en_1000.txt")
    edit_distance_max = 2
    prefix_length = 7
    verbosity = Verbosity.CLOSEST
    sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
    sym_spell.load_dictionary(dictionary_path, 0, 1)

    test_list = []
    with open(query_path, "r") as infile:
        for line in infile.readlines():
            line_parts = line.rstrip().split(" ")
            if len(line_parts) >= 2:
                test_list.append(line_parts[0])
    result_sum = 0
    for phrase in test_list:
        result_sum += len(
            sym_spell.lookup(phrase, verbosity, edit_distance_max))
    self.assertEqual(4945, result_sum)
def load_spell_checker():
    """Return spell checker"""
    if not os.path.exists("data/unigrams.txt"):
        sents = [normalize_text(" ".join(x)).split() for x in floresta.sents()]
        sents += [normalize_text(" ".join(x)).split() for x in machado.sents()]
        sents += [normalize_text(" ".join(x)).split()
                  for x in mac_morpho.sents()]
        unigrams = [item for sublist in sents for item in sublist]
        unigrams = nltk.probability.FreqDist(unigrams)
        file = open("data/unigrams.txt", "w")
        for k, v in unigrams.items():
            file.write(f"{k} {v}\n")
        file.close()
        bigrams = []
        for sent in sents:
            bigrams += list(nltk.bigrams(sent))
        bigrams = nltk.probability.FreqDist(bigrams)
        file = open("data/bigrams.txt", "w")
        for k, v in bigrams.items():
            file.write(f"{' '.join(k)} {v}\n")
        file.close()
    result = SymSpell()
    result.load_dictionary("data/unigrams.txt", 0, 1)
    result.load_bigram_dictionary("data/bigrams.txt", 0, 2)
    return result
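# Hypothetical usage sketch for load_spell_checker() above: assumes the NLTK Portuguese
# corpora (floresta, machado, mac_morpho) are downloaded and a data/ directory exists;
# the unigram and bigram files are built on the first call.
checker = load_spell_checker()
suggestions = checker.lookup_compound("qem sabe", max_edit_distance=2)
if suggestions:
    print(suggestions[0].term)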
def test_lookup_should_not_return_non_word_delete(self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("pawn", 10)
    result = sym_spell.lookup("paw", Verbosity.TOP, 0)
    self.assertEqual(0, len(result))
    result = sym_spell.lookup("awn", Verbosity.TOP, 0)
    self.assertEqual(0, len(result))
def load_symspell(dict_path='symspell/frequency_dictionary_en_82_765.txt',
                  max_edit_distance_dictionary=2, prefix_length=7,
                  term_index=0, count_index=1):
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(dict_path, term_index, count_index)
    return sym_spell
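# Hypothetical usage sketch for load_symspell() above: assumes the frequency dictionary
# exists at the default dict_path.
from symspellpy.symspellpy import Verbosity

sym_spell = load_symspell()
for suggestion in sym_spell.lookup("memebers", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.distance, suggestion.count)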
def __init__(self, progress, directory, countries_dict):
    self.progress = progress
    self.logger = logging.getLogger(__name__)
    self.spelling_update = Counter()
    self.directory = directory
    self.spell_path = os.path.join(self.directory, 'spelling.pkl')
    self.countries_dict = countries_dict
    self.sym_spell = SymSpell()
def symspell_dict(max_edit_dist, prefix_len):
    dictfile = DICT_DIR / "big.txt"  # downloaded from Peter Norvig's site
    sym_spell = SymSpell(max_edit_dist, prefix_len)
    # create the symspell dictionary using the dictfile
    if not sym_spell.create_dictionary(str(dictfile)):
        print("corpus file not found")
    return sym_spell
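# Hypothetical usage sketch for symspell_dict() above: assumes DICT_DIR points at a
# folder containing Peter Norvig's big.txt corpus.
from symspellpy.symspellpy import Verbosity

sym_spell = symspell_dict(max_edit_dist=2, prefix_len=7)
for suggestion in sym_spell.lookup("speling", Verbosity.CLOSEST, 2):
    print(suggestion.term, suggestion.count)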
def test_lookup_should_not_return_low_count_word_that_are_also_delete_word(
        self):
    print(' - %s' % inspect.stack()[0][3])
    sym_spell = SymSpell(16, 2, 7, 10)
    sym_spell.create_dictionary_entry("flame", 20)
    sym_spell.create_dictionary_entry("flam", 1)
    result = sym_spell.lookup("flam", Verbosity.TOP, 0)
    self.assertEqual(0, len(result))