コード例 #1
0
    def initialize(self):
        """Load every sub-module the text cleaner uses.

        Sets up, in order: the smart-contractions module, the English
        stop-word sets, the WordNet lemmatizer and the SymSpell spell
        checker. Progress messages are printed to stdout.

        Attributes read:
            self.embedding_for_smart_contraction: passed to ``Contractions``.
            self.spell_dictonarypath: dictionary file path, resolved against
                the current working directory.
        """
        print("Initializing Text Cleaner..")

        print("Initializing Smart Contractions Module..")
        self.cont = Contractions(self.embedding_for_smart_contraction)
        self.cont.load_models()

        print("Initializing Stopwords Module..")
        self.stop_words = set(stopwords.words('english'))
        # Second stop-word set that leaves out the negation words. Set
        # difference builds a new set and, unlike remove(), does not raise
        # when one of the words is missing from the stop-word list.
        self.stop_words_without_negation = (
            self.stop_words - {'no', 'nor', 'not'})
        self.pos_tags_set_1 = {'NNP'}

        print("Initializing Wordnet Lemmatizer Module..")
        self.wnl = WordNetLemmatizer()

        print("Initializing Spellcheck Module..")
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        # os.path.join picks the right separator for the platform; the
        # previous hard-coded "\\" only produced a valid path on Windows.
        dictionary_path = os.path.join(os.path.abspath(''),
                                       self.spell_dictonarypath)
        self.sym_spell.load_dictionary(dictionary_path, 0, 1)

        print("Initialization complete!")
コード例 #2
0
    def test_lookup_should_replicate_noisy_results(self):
        """Check the total number of CLOSEST suggestions for the noisy queries.

        The first column of every query line with at least two columns is
        looked up with a maximum edit distance of 2; the suggestion counts
        must add up to 4945.
        """
        print('  - %s' % inspect.stack()[0][3])
        here = path.realpath(path.dirname(__file__))
        dictionary_path = path.realpath(
            path.join(here, pardir, "symspellpy",
                      "frequency_dictionary_en_82_765.txt"))
        query_path = path.join(here, "fortests", "noisy_query_en_1000.txt")

        max_distance = 2
        sym_spell = SymSpell(83000, max_distance, 7)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        # Split every line on single spaces, then keep the first column of
        # the lines that carry at least two columns.
        with open(query_path, "r") as infile:
            split_lines = [raw.rstrip().split(" ") for raw in infile]
        queries = [columns[0] for columns in split_lines if len(columns) >= 2]

        total_suggestions = sum(
            len(sym_spell.lookup(query, Verbosity.CLOSEST, max_distance))
            for query in queries)
        self.assertEqual(4945, total_suggestions)
コード例 #3
0
def spell_correction(texte):
    """Return the top compound-lookup suggestion for ``texte``.

    A fresh SymSpell instance is built on every call from the
    ``../ressources/fr-100k.txt`` word list and the bigram list shipped with
    the ``symspellpy`` package. Every suggestion is printed as
    ``term, distance, count``.

    Returns:
        The first suggestion's term; ``texte`` unchanged when there are no
        suggestions; ``None`` when either dictionary fails to load.
    """
    checker = SymSpell(2, 7)
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")

    words_loaded = checker.load_dictionary(
        "../ressources/fr-100k.txt", term_index=0, count_index=1)
    if not words_loaded:
        print("Dictionary file not found")
        return None

    bigrams_loaded = checker.load_bigram_dictionary(
        bigram_path, term_index=0, count_index=2)
    if not bigrams_loaded:
        print("Bigram dictionary file not found")
        return None

    # The edit-distance limit (2) applies per word, not to the whole input.
    candidates = checker.lookup_compound(texte, 2)
    for candidate in candidates:
        print("{}, {}, {}".format(candidate.term, candidate.distance,
                                  candidate.count))

    if len(candidates) == 0:
        print("error with : ", texte)
        return texte
    return candidates[0].term
コード例 #4
0
def correctly_spelled(data, max_edit_distance_lookup=None):
    """Spell-correct the tokens of ``data[2]`` and join them with spaces.

    The module-level ``sym_speller`` is created on first use from
    ``frequency_dictionary_en_82_765.txt`` located beside this file; if that
    dictionary cannot be loaded the process is terminated via ``os._exit(1)``.

    A token is left untouched when it is not purely alphabetic, when it
    equals ``data[0]`` or ``data[1]``, or when ``data[1]`` has at least three
    words and the token equals the concatenation of their first characters.

    Args:
        data: indexable with at least three items; ``data[2]`` is passed to
            ``tokenized``.
        max_edit_distance_lookup: edit distance for each lookup; defaults to
            the module-level ``max_edit_distance_dictionary``.

    Returns:
        The corrected tokens joined by single spaces.
    """
    global sym_speller
    if sym_speller is None:
        sym_speller = SymSpell(max_edit_distance_dictionary, prefix_length)
        dict_path = os.path.join(os.path.dirname(__file__),
                                 "frequency_dictionary_en_82_765.txt")
        # Term in column 0, frequency in column 1.
        loaded = sym_speller.load_dictionary(dict_path, 0, 1)
        if not loaded:
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  dict_path)
            os._exit(1)

    if max_edit_distance_lookup is None:
        max_edit_distance_lookup = max_edit_distance_dictionary

    def keep_verbatim(token):
        # Evaluated lazily per token, in the same order as the conditions
        # of the original one-line expression.
        if not token.isalpha():
            return True
        if token == data[0] or token == data[1]:
            return True
        if len(data[1].split()) >= 3:
            return "".join([w[0] for w in data[1].split()]) == token
        return False

    corrected = []
    for token in tokenized(data[2]):
        if keep_verbatim(token):
            corrected.append(token)
        else:
            best = sym_speller.lookup_compound(token,
                                               max_edit_distance_lookup)[0]
            corrected.append(best.term)
    return " ".join(corrected)
コード例 #5
0
class SymSpellCorrection:
    """
        Use SymSpell word segmentation to correct a sentence.
    """
    def __init__(self, dictionary_path, term_index=0, count_index=1, max_edit_distance_dictionary=0, prefix_length=7, **args):
        """
        Input:
            - dictionary_path: string
            - term_index: int, column of the term in the dictionary text file, default is 0
            - count_index: int, column of the term frequency in the dictionary text file, default is 1
            - max_edit_distance_dictionary: int, maximum edit distance per dictionary precalculation, default is 0
            - prefix_length, int, default is 7
            - **args: accepted and ignored
        """
        # Imported here so the module can be loaded without symspellpy
        # until a corrector is actually constructed.
        from symspellpy.symspellpy import SymSpell
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.sym_spell.load_dictionary(dictionary_path, term_index, count_index)

    def __call__(self, sentence):
        """
            Input:
                - sentence: string

            Output:
                - string: the corrected sentence, or the input unchanged
                  when it is empty or when correction fails
        """
        if not sentence:
            return sentence
        try:
            corrected = self.sym_spell.word_segmentation(sentence).corrected_string
        except Exception:
            # Best effort: report and fall back to the raw input. Only
            # Exception is caught so that KeyboardInterrupt / SystemExit
            # still propagate.
            print("Error spell correction:", sentence)
            corrected = sentence
        return corrected
コード例 #6
0
 def __init__(self, lm, max_ed=4, prefix_length=7, l=1, channel_method_poisson=True, channel_prob_param=0.02):
     """Store the settings and fill a SymSpell dictionary from the model.

     Args:
         lm: language model. A ``GPT2LMHeadModel`` instance selects the
             GPT-2 hooks; any other object is handled as an n-gram model
             exposing ``order``, ``vocab`` and ``counts``.
         max_ed: first argument given to ``SymSpell``.
         prefix_length: second argument given to ``SymSpell``.
         l: stored as ``self.l``.
         channel_method_poisson: stored as-is.
         channel_prob_param: stored as-is.
     """
     self.show_progress = False
     self.lm = lm
     self.l = l
     self.channel_method_poisson = channel_method_poisson
     self.channel_prob_param = channel_prob_param

     self.sym_spell = SymSpell(max_ed, prefix_length)

     if not isinstance(self.lm, GPT2LMHeadModel):
         # n-gram model: bind the n-gram hooks and register every
         # vocabulary word with the count the model holds for it.
         self.lm_sent_logscore = self.ngram_sent_logscore
         self.beam_init = self.beam_ngram_init
         self.skipstart = self.lm.order-1
         self.skipend = None
         self.update_sentence_history = self.updatengramhistory
         self.tokenizer = ngramTokenizer(self.lm)
         for vocab_word in lm.vocab:
             self.sym_spell.create_dictionary_entry(key=vocab_word, count=self.lm.counts[vocab_word])
         return

     # GPT-2 model: bind the GPT-2 hooks and register each decoded
     # tokenizer id with a count of 1.
     self.device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     self.lm_sent_logscore = self.gpt2_sent_logscore
     self.beam_init = self.beam_GPT_init
     self.skipstart = 1
     self.skipend = -1
     self.update_sentence_history = self.updateGPT2history
     self.tokenizer = GPT2Tokenizer.from_pretrained('gpt2')
     for token_id in range(self.tokenizer.vocab_size):
         self.sym_spell.create_dictionary_entry(key=self.tokenizer.decode(token_id), count=1)
コード例 #7
0
def main():
    """Spell-correct the text of ``note.html`` and print the elapsed seconds.

    Loads ``frequency_dictionary_en_82_765.txt`` from the directory of this
    file, strips the HTML from ``note.html`` with ``stripHTML``, prints the
    stripped text, runs a compound lookup over it and prints every
    suggestion as ``term, distance, count``. Finally prints the ``seconds``
    component of the time spent on the lookup and printing.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 3

    # The context manager closes the file even if reading fails.
    with open("note.html", "r") as f:
        noteString = f.read()
    noteString = stripHTML(noteString)
    print(noteString)

    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(noteString,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))

    # Named `elapsed` so the local does not shadow the `time` module name.
    elapsed = datetime.now() - tstart
    print(elapsed.seconds)
コード例 #8
0
ファイル: methods.py プロジェクト: wfearn/preprocessing-paper
def spelling_preprocessor():
    """Build a callable that spell-corrects each whitespace-separated word.

    The frequency dictionary is read from
    ``$HOME/symspellpy/symspellpy/frequency_dictionary_en_82_765.txt``.

    Returns:
        A function, wrapped by ``string_check``, that replaces every word
        with its first CLOSEST suggestion (edit distance <= 2) and leaves
        words without a suggestion unchanged.

    Raises:
        ImportError: if the dictionary cannot be loaded.
    """
    import os
    from symspellpy.symspellpy import SymSpell, Verbosity

    speller = SymSpell(2, 7)
    freq_path = os.path.join(os.getenv('HOME'), 'symspellpy/symspellpy/frequency_dictionary_en_82_765.txt')

    # Term in column 0, frequency in column 1.
    if not speller.load_dictionary(freq_path, 0, 1):
        raise ImportError('Unable to load spelling dictionary')

    lookup_distance = 2
    verbosity = Verbosity.CLOSEST

    def best_match(word):
        # First suggestion when there is one, otherwise the word itself.
        hits = speller.lookup(word, verbosity, lookup_distance)
        return hits[0].term if hits else word

    @string_check
    def checker(s):
        return ' '.join([best_match(word) for word in s.split()])
    return checker
コード例 #9
0
    def __init__(self,
                 max_dictionary_edit_distance=2,
                 prefix_length=7,
                 dictionary_path=None):
        """Create the SymSpell object and load its frequency dictionary.

        Args:
            max_dictionary_edit_distance: maximum edit distance handed to
                ``SymSpell``.
            prefix_length: length of word prefixes handed to ``SymSpell``.
            dictionary_path: frequency dictionary file. When ``None``,
                ``frequency_dictionary_en_82_765.txt`` is looked up next to
                this module and, failing that, in the current working
                directory.

        A message is printed (no exception is raised) when the dictionary
        cannot be loaded.
        """
        # maximum edit-distance for doing lookups
        self.max_dictionary_edit_distance = max_dictionary_edit_distance

        # Length of word prefixes used for spell checking
        self.prefix_length = prefix_length

        # create object
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=self.max_dictionary_edit_distance,
            prefix_length=self.prefix_length)

        # load dictionary
        if dictionary_path is None:
            dictionary_name = "frequency_dictionary_en_82_765.txt"
            # Use the real __file__ (not the string literal '__file__', whose
            # dirname is always empty) so the default is found beside this
            # module regardless of the working directory.
            dictionary_path = os.path.join(
                os.path.dirname(os.path.abspath(__file__)), dictionary_name)
            if not os.path.isfile(dictionary_path):
                # Backward compatibility: the default used to resolve
                # against the current working directory.
                dictionary_path = dictionary_name

        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file

        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            print('Dictionary file not found')
コード例 #10
0
def initializeSymspell():
    """Create a SymSpell instance loaded with the packaged dictionaries.

    Loads the unigram and bigram frequency files shipped inside the
    ``symspellpy`` package, adds the entries ``ap`` and ``ain't`` with a
    count of 500000000, and prints progress plus the first five entries of
    each table.

    Returns:
        tuple: ``(symspell, vocab)`` where ``vocab`` is the set of all words
        in the unigram table.
    """
    print("inside initializeSymspell()")
    symspell = SymSpell(max_dictionary_edit_distance=2, prefix_length=7)
    print("symspell created")
    resourceNames = [
        "symspellpy", "frequency_dictionary_en_82_765.txt",
        "frequency_bigramdictionary_en_243_342.txt"
    ]
    dictionaryPath = pkg_resources.resource_filename(resourceNames[0],
                                                     resourceNames[1])
    bigramPath = pkg_resources.resource_filename(resourceNames[0],
                                                 resourceNames[2])
    print("dictionaryPath created")
    symspell.load_dictionary(dictionaryPath, 0, 1)
    symspell.create_dictionary_entry(key='ap', count=500000000)
    symspell.create_dictionary_entry(key="ain't", count=500000000)
    print(list(islice(symspell.words.items(), 5)))
    print("symspell.load_ditionary() done")
    # Bigram lines are "<word1> <word2> <count>": the count is in column 2.
    # With column 1 the second word is parsed as the count and every line
    # is rejected, leaving the bigram table empty.
    symspell.load_bigram_dictionary(bigramPath, 0, 2)
    print(list(islice(symspell.bigrams.items(), 5)))
    print("symspell.load_bigram_ditionary() done")

    # Create vocab: iterating the words mapping yields its keys.
    vocab = set(symspell.words)

    return symspell, vocab
コード例 #11
0
ファイル: spell.py プロジェクト: justinphan3110/Malaya
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Train a symspell Spell Corrector.

    Parameters
    ----------
    max_edit_distance_dictionary: int, (default=2)
        first argument passed to ``SymSpell``.
    prefix_length: int, (default=7)
        second argument passed to ``SymSpell``.
    term_index: int, (default=0)
        column of the term in the dictionary file.
    count_index: int, (default=1)
        column of the term frequency in the dictionary file.
    top_k: int, (default=10)
        forwarded to ``SYMSPELL`` as ``k``.
    **kwargs:
        forwarded to ``check_file``.

    Returns
    -------
    result: malaya.spell.SYMSPELL class

    Raises
    ------
    ModuleNotFoundError
        if ``symspellpy`` cannot be imported.
    """

    check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'], **kwargs)
    check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError as err:
        # Only import failures are translated; the raised type is still a
        # subclass of Exception, so existing `except Exception` callers
        # keep working. The original cause stays chained for debugging.
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.'
        ) from err
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = PATH_NGRAM['symspell']['model']
    sym_spell.load_dictionary(dictionary_path, term_index, count_index)
    with open(PATH_NGRAM[1]['model']) as fopen:
        corpus = json.load(fopen)
    return SYMSPELL(sym_spell, Verbosity.ALL, corpus, k=top_k)
コード例 #12
0
ファイル: spellcheck.py プロジェクト: folagit/examples
def setup(initial_capacity=83000,
          prefix_length=7,
          max_edit_distance_dictionary=2,
          dict_path='/home/fa6/data/symspellpy/frequency_dictionary_en_82_765.txt',
          count_threshold=30):
    """Create a SymSpell instance and load a frequency dictionary into it.

    Also stores ``max_edit_distance_dictionary`` in the module-level global
    ``maximum_edit_distance``.

    Args:
        initial_capacity: first positional argument passed to ``SymSpell``.
        prefix_length: prefix length passed to ``SymSpell``.
        max_edit_distance_dictionary: maximum edit distance passed to
            ``SymSpell``.
        dict_path: frequency dictionary file; defaults to the location that
            was previously hard-coded.
        count_threshold: ``count_threshold`` passed to ``SymSpell``; defaults
            to the previously hard-coded value.

    Returns:
        The loaded ``SymSpell`` object, or ``None`` (after printing a
        message) when the dictionary cannot be loaded.
    """
    global maximum_edit_distance
    maximum_edit_distance = max_edit_distance_dictionary

    sym_spell = SymSpell(initial_capacity,
                         max_edit_distance_dictionary,
                         prefix_length,
                         count_threshold=count_threshold)

    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dict_path, term_index, count_index):
        print("Dictionary file not found")
        return None

    return sym_spell
コード例 #13
0
ファイル: spell.py プロジェクト: huseinzol05/malaya
def symspell(max_edit_distance_dictionary: int = 2,
             prefix_length: int = 7,
             term_index: int = 0,
             count_index: int = 1,
             top_k: int = 10,
             **kwargs):
    """
    Load a symspell Spell Corrector for Malay.

    Parameters
    ----------
    max_edit_distance_dictionary: int, (default=2)
        first argument passed to ``SymSpell``.
    prefix_length: int, (default=7)
        second argument passed to ``SymSpell``.
    term_index: int, (default=0)
        column of the term in the dictionary file.
    count_index: int, (default=1)
        column of the term frequency in the dictionary file.
    top_k: int, (default=10)
        forwarded to ``Symspell`` as ``k``.
    **kwargs:
        forwarded to ``check_file``.

    Returns
    -------
    result: malaya.spell.Symspell class

    Raises
    ------
    ModuleNotFoundError
        if ``symspellpy`` cannot be imported.
    """

    try:
        from symspellpy.symspellpy import SymSpell, Verbosity
    except ImportError as err:
        # Catch import failures only: BaseException would also turn
        # KeyboardInterrupt / SystemExit into a misleading "not installed"
        # error. The original cause stays chained.
        raise ModuleNotFoundError(
            'symspellpy not installed. Please install it and try again.'
        ) from err

    path = check_file(PATH_NGRAM['symspell'], S3_PATH_NGRAM['symspell'],
                      **kwargs)
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    sym_spell.load_dictionary(path['model'], term_index, count_index)

    path = check_file(PATH_NGRAM[1], S3_PATH_NGRAM[1], **kwargs)
    with open(path['model']) as fopen:
        corpus = json.load(fopen)
    return Symspell(sym_spell, Verbosity.ALL, corpus, k=top_k)
コード例 #14
0
def create_context_speller():
  """Build a checker backed by the context frequency lookup table.

  Returns:
    A function ``check_spell(word)`` that returns ``False`` when any CLOSEST
    suggestion (edit distance <= 2) for ``word`` lies at distance exactly 1,
    and ``True`` otherwise, including when there are no suggestions at all.

  Raises:
    Exception: if ``data/dict/context_dist_small.txt`` cannot be loaded.
  """
  speller = SymSpell(83000, 2, 7)

  table_path = os.path.join(os.path.dirname(
      __file__), "./data/dict/context_dist_small.txt")
  loaded = speller.load_dictionary(table_path, 0, 1)
  if not loaded:
    raise Exception("Dictionary file not found")

  def check_spell(word):
    closest = speller.lookup(word, Verbosity.CLOSEST, 2)
    # all() over an empty result is True, which covers the "not in
    # context" case as well.
    return all(hit.distance != 1 for hit in closest)

  return check_spell
コード例 #15
0
def spelling_correction(data, column):
    """Collect the best spelling suggestion for every word of a text column.

    Args:
        data: DataFrame whose ``column`` holds the text to check.
        column: label of the text column.

    Returns:
        DataFrame with the columns ``'Original Word'`` and ``'Replacement'``,
        one row per word that received at least one suggestion. Every row
        has index label 0. When no word received a suggestion, an empty
        DataFrame without columns is returned.
    """
    from symspellpy.symspellpy import SymSpell, Verbosity
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = "frequency_dictionary_en_82_765.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")

    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL

    # Gather plain lists and build the frame once at the end:
    # DataFrame.append copied the whole frame on every call and no longer
    # exists in pandas >= 2.0.
    originals = []
    replacements = []
    for _, row in data.iterrows():
        for input_term in row[column].split():
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                           max_edit_distance_lookup)
            if suggestions:
                originals.append(input_term)
                replacements.append(suggestions[0].term)

    if not originals:
        return pd.DataFrame()
    # index of all zeros: the same labels the repeated append of one-row
    # frames used to produce, so callers see an unchanged result.
    return pd.DataFrame(
        {'Original Word': originals, 'Replacement': replacements},
        index=[0] * len(originals))
コード例 #16
0
    def test_words_from_list_with_shared_prefix_should_retain_counts(self):
        """Words given via ``words=`` keep their individual counts.

        "pipe" (5 occurrences) and "pips" (10 occurrences) share a prefix;
        lookups for "pipe", "pips" and "pip" must each return both terms
        with those counts, in the expected order.
        """
        print('  - %s' % inspect.stack()[0][3])
        corpus = ["pipe"] * 5 + ["pips"] * 10
        sym_spell = SymSpell(16, 1, 3, words=corpus)

        pipe_first = (("pipe", 5), ("pips", 10))
        pips_first = (("pips", 10), ("pipe", 5))
        expectations = (
            ("pipe", pipe_first),
            ("pips", pips_first),
            ("pip", pips_first),
        )
        for query, ranked in expectations:
            result = sym_spell.lookup(query, Verbosity.ALL, 1)
            self.assertEqual(2, len(result))
            for position, (term, count) in enumerate(ranked):
                self.assertEqual(term, result[position].term)
                self.assertEqual(count, result[position].count)
コード例 #17
0
def main():
    """Print the CLOSEST suggestions for the hard-coded word "bangeeet".

    The dictionary is read from ``corpus/dictionary/dictionary.txt`` beside
    this file; when it cannot be loaded a message is printed and the
    function returns. Each suggestion is printed as ``term, distance, count``.
    """
    # Dictionary precalculation: edit distance 2, prefix length 7.
    speller = SymSpell(2, 7)

    dict_file = os.path.join(os.path.dirname(__file__),
                             "corpus/dictionary/dictionary.txt")
    # Term in column 0, frequency in column 1.
    loaded = speller.load_dictionary(dict_file, 0, 1)
    if not loaded:
        print("Dictionary file not found")
        return

    # Single-word lookup; the lookup distance (2) must not exceed the
    # dictionary distance.
    matches = speller.lookup("bangeeet", Verbosity.CLOSEST, 2)
    for match in matches:
        print("{}, {}, {}".format(match.term, match.distance,
                                  match.count))
コード例 #18
0
def correct_spelling(sentence):
    """Return the first compound-lookup correction of ``sentence``.

    Occurrences of ``"&amp ;"`` are replaced by ``"and"`` before the lookup.

    Returns:
        The first suggestion's term, ``""`` when there is no suggestion, or
        ``None`` (after printing a message) when the dictionary beside this
        file cannot be loaded.
    """
    # Dictionary precalculation: edit distance 2, prefix length 5.
    speller = SymSpell(2, 5)
    freq_path = os.path.join(os.path.dirname(__file__),
                             "frequency_dictionary_en_82_765.txt")
    # Term in column 0, frequency in column 1.
    if not speller.load_dictionary(freq_path, 0, 1):
        print("Dictionary file not found")
        return None

    if "&amp ;" in sentence:
        sentence = sentence.replace("&amp ;", "and")

    # Only the first suggestion is used.
    for best in speller.lookup_compound(sentence, 2):
        return best.term
    return ""
コード例 #19
0
ファイル: sentiment.py プロジェクト: shwinshaker/CS256A2
    class SpellCorrector():
        """Single-word spell corrector backed by SymSpell."""

        def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
            """Create the SymSpell object and load the frequency dictionary.

            Raises:
                FileNotFoundError: if the dictionary cannot be loaded.
            """
            self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
            # load dictionary
            dictionary_path = os.path.join(os.path.dirname('../'),
                                           "frequency_dictionary_en_82_765.txt")
            term_index = 0  # column of the term in the dictionary text file
            count_index = 1  # column of the term frequency in the dictionary text file
            if not self.sym_spell.load_dictionary(dictionary_path, term_index, count_index):
                # `raise "<str>"` is itself a TypeError in Python 3; raise a
                # real exception that carries the intended message.
                raise FileNotFoundError("Dictionary file not found")

        def reduce_lengthening(self, text):
            """Collapse runs of three or more identical characters to two."""
            # Author's note on the original: "not work".
            pattern = re.compile(r"(.)\1{2,}")
            return pattern.sub(r"\1\1", text)

        def strip_punc(self, word):
            """Remove one trailing ``-``, ``_``, ``.`` or ``!`` from ``word``."""
            # Author's note on the original: "not work".
            return re.sub(r"[\-\_\.\!]$", "", word)

        def __call__(self, word):
            """Return the closest dictionary term for ``word``.

            Character runs are shortened first. Words of at most two
            characters, words containing an apostrophe and words without
            any suggestion are returned as they are after that shortening.
            """
            word = self.reduce_lengthening(word)
            if len(word) > 2 and "'" not in word:
                suggestions = self.sym_spell.lookup(word, Verbosity.CLOSEST, 2)
                if suggestions:
                    return suggestions[0].term
            return word
コード例 #20
0
    def __init__(self,
                 train=False,
                 save=False,
                 corpus_path=CORPUS_PATH,
                 threshold=2):
        """Load or train the spelling resources and set up SymSpell.

        Args:
            train: when true, rebuild the dictionary, word list, counter and
                language model from ``corpus_path``; otherwise load the
                pickled word list and counter and a saved language model.
            save: when equal to ``True``, call ``self.save()`` at the end.
            corpus_path: corpus location used when ``train`` is true.
            threshold: words whose count is less than or equal to this value
                are removed from ``self.words``.

        If the SymSpell dictionary cannot be loaded, a message is printed
        and initialisation stops before the optional save.
        """
        module_dir = os.path.dirname(__file__)

        def load_pickle(relative_path):
            # The context manager closes the file handle, which
            # pickle.load(open(...)) left open.
            # NOTE(review): pickle is only safe for trusted files.
            with open(os.path.join(module_dir, relative_path), "rb") as handle:
                return pickle.load(handle)

        self.slang_dict = load_pickle("pickled/_slang_words.p")
        self.slang_dict['dr'] = 'dari'
        self.slang_dict['k'] = 'ke'
        self.slang_dict['sc'] = 'sesar'

        if train:
            create_dictionary.main()
            self.words = self.__words(corpus_path)
            self.counter = self.__counter(self.words)
            self.model = model.LanguageModel(corpus_path=corpus_path)
        else:
            self.words = load_pickle("pickled/_spell_words.p")
            self.counter = load_pickle("pickled/_spell_counter.p")
            self.model = model.LanguageModel(load=True)

        # Drop rare words. A word missing from self.words is skipped on its
        # own; previously a blanket `except: pass` around the whole loop
        # silently stopped the pruning at the first such word.
        for key in self.counter:
            if self.counter[key] <= threshold:
                try:
                    self.words.remove(key)
                except (KeyError, ValueError):
                    continue

        self.candidates_dict = {}

        # maximum edit distance per dictionary precalculation
        max_edit_distance_dictionary = 2
        prefix_length = 7

        # create object
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        self.factory = StemmerFactory()
        self.stemmer = self.factory.create_stemmer()
        # load dictionary
        dictionary_path = os.path.join(module_dir,
                                       "corpus/dictionary/dictionary.txt")
        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index, count_index, encoding="utf-8"):
            print("Dictionary file not found")
            return

        if save == True:
            self.save()
コード例 #21
0
def createSymSpell(dict='ru-100k.txt', encoding='utf-8'):
    """Return a SymSpell object loaded from the dictionary file ``dict``.

    The speller uses a maximum dictionary edit distance of 2 and a prefix
    length of 5. Terms are read from column 0 and counts from column 1 of
    the file, which is opened with ``encoding``.
    """
    speller = SymSpell(max_dictionary_edit_distance=2, prefix_length=5)
    load_options = {'term_index': 0, 'count_index': 1, 'encoding': encoding}
    speller.load_dictionary(dict, **load_options)
    return speller
コード例 #22
0
ファイル: read_data.py プロジェクト: myeditha/switchsand
def getSymspellDict(direc):
    """Return a SymSpell object loaded from the dictionary file ``direc``.

    A message is printed when loading fails; the (then unloaded) object is
    returned either way.
    """
    print("loading symspell object")
    speller = SymSpell(83000, 2, 7)
    # Term in column 0, frequency in column 1.
    loaded = speller.load_dictionary(direc, 0, 1)
    if not loaded:
        print("Dictionary file not found")
    return speller
コード例 #23
0
def load_symspell(dict_path='symspell/frequency_dictionary_en_82_765.txt',
                  max_edit_distance_dictionary=2,
                  prefix_length=7,
                  term_index=0,
                  count_index=1):
    """Build a SymSpell object and load ``dict_path`` into it.

    ``term_index`` and ``count_index`` are the columns of the term and of
    its frequency in the dictionary file. The result of the load call is
    not checked.
    """
    speller = SymSpell(max_edit_distance_dictionary, prefix_length)
    speller.load_dictionary(dict_path, term_index, count_index)
    return speller
コード例 #24
0
ファイル: SpellCheck.py プロジェクト: johnbickmore/GeoFinder
 def __init__(self, progress, directory, countries_dict):
     """Store the collaborators and create an empty SymSpell instance.

     ``spell_path`` is set to ``spelling.pkl`` inside ``directory`` and
     ``spelling_update`` starts out as an empty ``Counter``.
     """
     self.logger = logging.getLogger(__name__)
     self.progress = progress
     self.countries_dict = countries_dict
     self.directory = directory
     self.spell_path = os.path.join(self.directory, 'spelling.pkl')
     self.spelling_update = Counter()
     self.sym_spell = SymSpell()
コード例 #25
0
def symspell_dict(max_edit_dist, prefix_len):
    """Return a SymSpell object whose dictionary is built from ``big.txt``.

    The corpus file is expected inside ``DICT_DIR``. A message is printed
    when ``create_dictionary`` reports failure; the object is returned
    either way.
    """
    corpus_path = str(DICT_DIR / "big.txt")
    speller = SymSpell(max_edit_dist, prefix_len)

    built = speller.create_dictionary(corpus_path)
    if not built:
        print("corpus file not found")
    return speller
コード例 #26
0
def main():
    """Print the compound-lookup suggestions for the hard-coded word "live".

    The dictionary is read from ``dictionary_final.txt`` beside this file;
    when it cannot be loaded a message is printed and the function returns.
    Each suggestion is printed as ``term, distance, count``.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    input_term = 'live'
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
コード例 #27
0
ファイル: sentiment.py プロジェクト: shwinshaker/CS256A2
 def __init__(self, max_edit_distance_dictionary=2, prefix_length=7):
     """Create the SymSpell object and load the frequency dictionary.

     Raises:
         FileNotFoundError: if the dictionary cannot be loaded.
     """
     self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
     # load dictionary
     dictionary_path = os.path.join(os.path.dirname('../'),
                                    "frequency_dictionary_en_82_765.txt")
     term_index = 0  # column of the term in the dictionary text file
     count_index = 1  # column of the term frequency in the dictionary text file
     if not self.sym_spell.load_dictionary(dictionary_path, term_index, count_index):
         # `raise "<str>"` is itself a TypeError in Python 3; raise a real
         # exception that carries the intended message.
         raise FileNotFoundError("Dictionary file not found")
コード例 #28
0
def symspell_test(tokenpos_list,
                  max_edit_distance_lookup=3,
                  initial_capacity=83000,
                  max_edit_distance_dictionary=3,
                  prefix_length=7,
                  term_index=0,
                  count_index=1):
    """
    Spell-check a sequence of ``(word, pos)`` pairs with SymSpell.

    Words tagged ``'PROPN'`` and words shorter than three characters are
    passed through unchanged and are also recorded in the second returned
    list. Every other word is looked up with ``Verbosity.CLOSEST`` and
    replaced by the first suggestion; if the lookup yields no suggestion the
    original word is kept.

    Key-word arguments are:
        ** max_edit_distance_lookup : (Recommended maximum = 3)
        ** initial_capacity, max_edit_distance_dictionary, prefix_length :
           passed positionally, in that order, to ``SymSpell``
        ** term_index : term column in dictionary (0)
        ** count_index : frequency column in dictionary (1)

    Returns:
        ``(suggestion_list, proper_noun)`` on success, the string
        ``'Error loading dictionary file'`` when the dictionary cannot be
        loaded, or ``405`` when a ``TypeError`` is raised while processing.
    """
    print('\n{} \nBegin \'Symspellpy\' testing \n'.format('#' * 20))

    try:
        sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                             prefix_length)
        suggestion_verbosity = Verbosity.CLOSEST

        dictionary_path = os.path.join(os.path.dirname(__file__),
                                       "frequency_dictionary_en_82_765.txt")
        if not sym_spell.load_dictionary(dictionary_path, term_index,
                                         count_index):
            print("Dictionary file not found")
            return 'Error loading dictionary file'
        suggestion_list = []
        proper_noun = []

        for (word, pos) in tokenpos_list:
            if pos == 'PROPN' or len(word) < 3:
                # Proper nouns and very short tokens are never corrected.
                suggestion_list.append(word)
                proper_noun.append(word)
                continue

            suggestions = list(
                sym_spell.lookup(word, suggestion_verbosity,
                                 max_edit_distance_lookup))
            if not suggestions:
                # Nothing within the edit distance: keep the word as typed
                # instead of failing with IndexError on suggestions[0].
                suggestion_list.append(word)
                continue

            suggestion = suggestions[0]
            # display suggestion term, term frequency, and edit distance
            print(
                "input_term = {}, suggestion_term = {}, suggestion_count = {},\
                suggestion_distance =  {}".format(word, suggestion.term,
                                                  suggestion.count,
                                                  suggestion.distance))
            suggestion_list.append(suggestion.term)
        print("\n\nThe corrected sentence is : {}".format(
            ' '.join(suggestion_list)))
        print(suggestion_list)
        print(proper_noun)
        return suggestion_list, proper_noun
    except TypeError as error:
        print(f'Invalid type : {error}')
        return 405
コード例 #29
0
    def test_words_with_shared_prefix_should_retain_counts(self):
        """Check that "pipe" and "pips" keep their own counts in lookups.

        Two entries sharing the prefix "pip" are added, then three queries
        are run with ``Verbosity.ALL`` and edit distance 1. Each query must
        return exactly two results in the listed order with the listed
        counts.
        """
        print('  - %s' % inspect.stack()[0][3])
        speller = SymSpell(16, 1, 3)
        for entry, frequency in (("pipe", 5), ("pips", 10)):
            speller.create_dictionary_entry(entry, frequency)

        # query -> expected (term, count) pairs, in result order
        expectations = (
            ("pipe", (("pipe", 5), ("pips", 10))),
            ("pips", (("pips", 10), ("pipe", 5))),
            ("pip", (("pips", 10), ("pipe", 5))),
        )
        for query, ranked in expectations:
            found = speller.lookup(query, Verbosity.ALL, 1)
            self.assertEqual(2, len(found))
            for position, (term, count) in enumerate(ranked):
                self.assertEqual(term, found[position].term)
                self.assertEqual(count, found[position].count)
コード例 #30
0
def main():
    """Segment a sentence written without spaces and print the result.

    Builds one SymSpell instance, loads the frequency dictionary from the
    ``data`` directory next to this file, runs ``word_segmentation`` on a
    fixed sample string, and prints the corrected string, the distance sum
    and the log-probability sum. Prints a message and returns early when the
    dictionary cannot be loaded.
    """
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 0
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # load dictionary
    dictionary_path = os.path.join(
        os.path.dirname(__file__), "./data/frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # The instance above is already configured as (83000, 0, 7) and loaded,
    # so it is reused rather than rebuilt and reloaded a second time.

    # a sentence without any spaces
    input_term = "thequickbrownfoxjumpsoverthelazydog"
    result = sym_spell.word_segmentation(input_term)
    # display suggestion term, term frequency, and edit distance
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
コード例 #31
0
ファイル: add-exercises_2md.py プロジェクト: edsomjr/TEP
def main(argv):
    """Append a list of exercise links for a category to a markdown file.

    Args:
        argv: command-line style list; ``argv[1]`` is the category text and
            ``argv[2]`` is the path of the markdown file to append to. Any
            other length prints a usage message and returns.

    The category dictionary ``category_count.txt`` is loaded into SymSpell
    (term in column 0, count in column 1) and read a second time to map each
    term to its third space-separated field. Each word of the lower-cased
    category is looked up with ``Verbosity.CLOSEST`` and every suggestion is
    written to the markdown file as a bullet with an a2oj.com link.
    """
    if len(argv) != 3:
        print('usage:\n    python .py "<categoria>" <markdown gerado>')
        return
    category = argv[1]
    markdown = argv[2]

    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary, prefix_length)
    dictionary_path = "category_count.txt"
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file

    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # Terms missing from the mapping fall back to 0, as before.
    link_by_term = defaultdict(int)
    with open(dictionary_path, 'r') as categories:
        for line in categories:
            fields = line.split(' ')
            if len(fields) < 3:
                # Blank or short lines carry no third field; skip them
                # instead of failing with IndexError.
                continue
            # The third field is stored untouched (including any trailing
            # newline), exactly as it appears in the file.
            link_by_term[fields[0]] = fields[2]

    with open(markdown, 'a') as out:
        out.write('\n## Lista de Exercicios - %s\n' % category.capitalize())
        words = category.lower().split(' ')
        average_length = sum(map(len, words)) / len(words)

        suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
        # Longer words on average get a more permissive edit distance.
        max_edit_distance_lookup = 3 if average_length > 4 else 2
        for input_term in words:
            suggestions = sym_spell.lookup(input_term, suggestion_verbosity, max_edit_distance_lookup)
            for suggestion in suggestions:
                out.write("* {}, https://a2oj.com/{}".format(
                    suggestion.term.capitalize(), link_by_term[suggestion.term]))