Example #1
0
    def test_lookup_compound_ignore_non_words(self):
        """lookup_compound with ignore_non_words=True must leave numbers and
        unknown all-caps/mixed-case tokens untouched while correcting the
        rest of each phrase."""
        print('  - %s' % inspect.stack()[0][3])
        cwd = path.realpath(path.dirname(__file__))
        dictionary_path = path.realpath(
            path.join(cwd, pardir, "symspellpy",
                      "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        # (typo, expected correction) pairs; each lookup must yield exactly
        # one suggestion whose term equals the expected correction.
        cases = [
            (("whereis th elove 123 hehad dated forImuch of THEPAST who "
              "couqdn'tread in SIXTHgrade and ins pired him"),
             ("where is the love 123 he had dated for much of THEPAST "
              "who couldn't read in sixth grade and inspired him")),
            ("in te DHIRD 1 qarter oflast jear he hadlearned ofca sekretplan",
             ("in the DHIRD 1 quarter of last year he had learned "
              "of a secret plan")),
            (("the bigjest playrs in te stroGSOmmer film slatew ith PLETY "
              "of 12 funn"),
             ("the biggest players in the strong summer film slate "
              "with PLETY of 12 fun")),
            (("Can yu readtHIS messa ge despite thehorible 1234 "
              "sppelingmsitakes"),
             ("can you read this message despite the horrible 1234 "
              "spelling mistakes")),
            ("PI on leave, arrange Co-I to do screening",
             "PI on leave arrange co i to do screening"),
        ]
        for typo, correction in cases:
            results = sym_spell.lookup_compound(typo, edit_distance_max, True)
            self.assertEqual(1, len(results))
            self.assertEqual(correction, results[0].term)
Example #2
0
def correct_spelling(sentence):
    """Return the best compound spelling correction for *sentence*.

    Returns the top suggestion's term, "" when the lookup produced no
    suggestions, or None when the dictionary file could not be loaded.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 5
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load the bundled English frequency dictionary next to this module
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return
    # NOTE(review): replaces the literal "&amp ;" (HTML-escaped ampersand with
    # a stray space) with the word "and" before correcting — confirm this is
    # the intended normalization for the input corpus.
    if "&amp ;" in sentence:
        sentence = sentence.replace("&amp ;", "and")
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(sentence, max_edit_distance_lookup)
    # idiomatic replacement of the original for/break loop: take the first
    # suggestion's term when any suggestion was produced
    return suggestions[0].term if suggestions else ""
Example #3
0
def spell_correction(texte):
    """Compound-correct *texte* with SymSpell and return the top
    suggestion's term; falls back to the input when nothing is found."""
    # NOTE(review): the unigram dictionary is French (fr-100k.txt) while the
    # bigram dictionary is the English one bundled with symspellpy — confirm
    # this mix is intentional.
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    dictionary_path = "../ressources/fr-100k.txt"
    bigram_path = pkg_resources.resource_filename(
        "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
    if not sym_spell.load_dictionary(
            dictionary_path, term_index=0, count_index=1):
        print("Dictionary file not found")
        return
    if not sym_spell.load_bigram_dictionary(
            bigram_path, term_index=0, count_index=2):
        print("Bigram dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(texte, max_edit_distance_lookup)
    # show every suggestion with its edit distance and term frequency
    for hit in suggestions:
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))
    if suggestions:
        return suggestions[0].term
    print("error with : ", texte)
    return texte
def main():
    """Read note.html, strip its HTML, spell-correct the text with
    lookup_compound and print each suggestion plus the elapsed seconds."""
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 3
    prefix_length = 7
    # create object
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # load the dictionary shipped next to this script
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 3
    # fix: close the file deterministically (the original leaked the handle);
    # the original also assigned an unused demo input_term, now removed
    with open("note.html", "r") as f:
        note_string = f.read()
    note_string = stripHTML(note_string)
    print(note_string)

    tstart = datetime.now()
    suggestions = sym_spell.lookup_compound(note_string,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))

    tend = datetime.now()
    elapsed = tend - tstart
    print(elapsed.seconds)
Example #5
0
def main():
    """Load the bundled English dictionary plus a personal one, then print
    compound-lookup corrections (with casing transfer) for a demo phrase."""
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # bundled English frequency dictionary next to this script
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # additional personal dictionary (return value intentionally ignored,
    # matching the original behavior)
    sym_spell.load_dictionary(
        "/home/yadi/projectDISK/Python-Projects/ML-NLP/dictionary.txt", 0, 1)

    # multi-word input: lookup_compound supports splitting & merging
    input_term = ("whereis th elove hehad dated forImuch of thepast who "
                  "couqdn'tread in sixtgrade and ins pired him."
                  "I'm workig in th e yadolah shahrary working in githib")
    # max edit distance per lookup (per single word, not per whole input string)
    max_edit_distance_lookup = 1
    hits = sym_spell.lookup_compound(input_term,
                                     max_edit_distance_lookup,
                                     transfer_casing=True)
    # echo the raw input, then each corrected suggestion
    print(input_term)
    for hit in hits:
        print("{}".format(hit.term))
Example #6
0
def correctly_spelled(data, max_edit_distance_lookup=None):
    """Spell-correct the tokens of a data sample with a shared SymSpell speller.

    Lazily initializes the module-global speller on first call.
    NOTE(review): data appears to be indexable, with data[0] / data[1] as
    fields whose tokens must stay verbatim (data[1] is also matched via its
    initialism when it has >= 3 words) and data[2] as the text to correct —
    confirm against the caller. Relies on the module-level names
    max_edit_distance_dictionary, prefix_length and tokenized().
    Returns the (possibly corrected) tokens joined with single spaces.
    """
    global sym_speller  # Make the SymspellPy-based speller global to be able to be used in the body of this function
    if sym_speller is None:  # If the speller is not initialized
        sym_speller = SymSpell(
            max_edit_distance_dictionary,
            prefix_length)  # Initialize the speller provided its parameters as
        # previously defined
        sym_spell_dict_path = os.path.join(os.path.dirname(__file__),
                                           "frequency_dictionary_en_82_765.txt"
                                           )  # Load the frequency dictionary
        # to the speller
        term_index = 0  # Column of the term in the dictionary text file
        count_index = 1  # Column of the term frequency in the dictionary text file
        if not sym_speller.load_dictionary(
                sym_spell_dict_path, term_index,
                count_index):  # If the dictionary was not found
            print("ERROR! SymSpellPy dictionary not found at following path:",
                  sym_spell_dict_path
                  )  # Print error message informing about this
            os._exit(1)  # Exit the entire program

    if max_edit_distance_lookup is None:  # If no maximum edit distance during lookup is specified
        max_edit_distance_lookup = max_edit_distance_dictionary  # Assign the same edit distance to that as to the maximum edit distance
        # on the dictionary

    # Correct spelling of each purely-alphabetic token, keeping tokens equal
    # to data[0], data[1], or data[1]'s initialism untouched
    return " ".join([
        (sym_speller.lookup_compound(t, max_edit_distance_lookup)[0].term if
         t.isalpha() and not (t == data[0] or t == data[1] or
                              ("".join([x[0] for x in data[1].split()]) == t if
                               len(data[1].split()) >= 3 else False)) else t)
        for t in tokenized(data[2])
    ])
def main():
    """Demo: load a custom frequency dictionary and print compound-lookup
    suggestions for the word 'live'.

    The original carried a large block of commented-out batch-correction
    code and unused locals (s, suggestion_verbosity); both removed.
    """
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 9

    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # custom dictionary shipped next to this script
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "dictionary_final.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # max edit distance per lookup
    # (max_edit_distance_lookup <= max_edit_distance_dictionary)
    max_edit_distance_lookup = 2

    input_term = 'live'
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # display suggestion term, edit distance, and term frequency
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))
def symspell_checker(text):
    """Return the top compound spelling correction for *text*, or *text*
    itself when no suggestion is produced."""
    from symspellpy.symspellpy import SymSpell
    spell = SymSpell()
    # unigram and bigram dictionaries resolved relative to the CWD
    spell.load_dictionary(r"frequency_dictionary_en_82_765.txt", 0, 1)
    spell.load_bigram_dictionary(r"frequency_bigramdictionary_en_243_342.txt", 0, 2)
    result = spell.lookup_compound(text, 2)
    # first suggestion wins; idiomatic replacement of the for/return loop
    return result[0].term if result else text
Example #9
0
    def test_lookup_compound(self):
        """lookup_compound must return exactly one suggestion whose term
        equals the expected correction for each noisy input phrase."""
        print('  - %s' % inspect.stack()[0][3])
        cwd = path.realpath(path.dirname(__file__))
        dictionary_path = path.realpath(
            path.join(cwd, pardir, "symspellpy",
                      "frequency_dictionary_en_82_765.txt"))

        edit_distance_max = 2
        prefix_length = 7
        sym_spell = SymSpell(83000, edit_distance_max, prefix_length)
        sym_spell.load_dictionary(dictionary_path, 0, 1)

        # (typo, expected correction) pairs
        cases = [
            (("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixthgrade and ins pired him"),
             ("where is the love he had dated for much of the past "
              "who couldn't read in sixth grade and inspired him")),
            ("in te dhird qarter oflast jear he hadlearned ofca sekretplan",
             ("in the third quarter of last year he had learned of a "
              "secret plan")),
            (("the bigjest playrs in te strogsommer film slatew ith plety "
              "of funn"),
             ("the biggest players in the strong summer film slate "
              "with plenty of fun")),
            ("Can yu readthis messa ge despite thehorible sppelingmsitakes",
             ("can you read this message despite the horrible "
              "spelling mistakes")),
        ]
        for typo, correction in cases:
            results = sym_spell.lookup_compound(typo, edit_distance_max)
            self.assertEqual(1, len(results))
            self.assertEqual(correction, results[0].term)
Example #10
0
    async def quote(self, message, args):
        """Quote an earlier message from the channel.

        args[0] is either a message id (fetched directly) or a fuzzy search
        phrase: the phrase seeds a throwaway SymSpell dictionary and the
        last 100 channel messages are scanned for one whose compound
        correction matches the phrase's own correction.
        """
        msg = None
        if try_parse_int64(args[0]) is not None:
            # args[0] parses as an integer id — fetch that exact message
            msg_id = args[0]
            try:
                msg = await self.client.get_message(message.channel.id, msg_id)
            except Exception as exception:  # pylint: disable=W0703
                LOG.exception(exception)
        else:
            # fuzzy path: every word of the query becomes a dictionary entry,
            # so lookup_compound normalizes both query and candidates the
            # same way before comparison
            input_term = args[0]
            sym_spell = SymSpell()
            for term in input_term.split(" "):
                sym_spell.create_dictionary_entry(term, 1)
            target = sym_spell.lookup_compound(input_term, 2)[0].term
            iterator = message.channel.history(limit=100)
            for __ in range(100):
                try:
                    msg = await iterator.next()
                    suggestion = sym_spell.lookup_compound(msg.content, 2)[0]
                    if suggestion.term == target:
                        msg = await self.client.get_message(
                            message.channel.id, msg.id)
                        break
                except NoMoreItems:
                    msg = None
        if msg is not None:
            # NOTE(review): msg is accessed as a dict here but as an object
            # (msg.content / msg.id) above — presumably get_message returns a
            # raw payload dict while history yields model objects; confirm
            # against the client API.
            display_name = message.guild.get_member(int(
                msg["author"]["id"])).display_name
            # shift the UTC timestamp into the configured timezone offset
            time_str = (
                datetime.strptime(msg["timestamp"].split(".")[0],
                                  "%Y-%m-%dT%H:%M:%S") +
                timedelta(hours=TZ_OFFSET)).strftime("%Y-%m-%d %I:%M %p")
            quote_msg = "```{} - {} UTC+{}\n{}```".format(
                display_name, time_str, TZ_OFFSET, msg["content"])
        else:
            quote_msg = "Message not found!"

        await self.client.send_message(message.channel.id, quote_msg)
        await message.delete()
def main():
    """Three SymSpell demos: single-word lookup, compound lookup on a
    noisy phrase, and word segmentation of a space-less sentence."""
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

    # frequency dictionary shipped next to this script
    # (term in column 0, frequency in column 1)
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # 1) single-word lookup; lookup distance must not exceed the dictionary max
    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    for hit in sym_spell.lookup("pyth", suggestion_verbosity,
                                max_edit_distance_lookup):
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))

    # 2) compound lookup: supports splitting & merging of multi-word input
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixtgrade and ins pired him")
    for hit in sym_spell.lookup_compound(phrase, max_edit_distance_lookup):
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))

    # 3) word segmentation of a sentence without any spaces
    result = sym_spell.word_segmentation("thequuickbrownfoxjumpsoverthelazydog")
    print("{}, {}, {}".format(result.corrected_string, result.distance_sum,
                              result.log_prob_sum))
Example #12
0
def correction(input_term):
    """Return spelling suggestions for *input_term*.

    Combines single-word lookup and compound lookup, sorts by edit
    distance, removes duplicate terms, and returns None when either the
    dictionary is missing or every remaining suggestion equals the input
    (i.e. no correction is needed). Otherwise returns the deduplicated
    suggestion list.
    """
    import collections

    # create object
    sym_spell = SymSpell()
    # load the dictionary shipped next to this module
    dictionary_path = os.path.join(os.path.dirname(__file__),
                                   "frequency_dictionary_en_82_765.txt")
    term_index = 0  # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary text file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    max_edit_distance_lookup = 2
    suggestion_verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    suggestions = sym_spell.lookup(input_term, suggestion_verbosity,
                                   max_edit_distance_lookup)

    suggestions.extend(
        sym_spell.lookup_compound(input_term, max_edit_distance_lookup))

    # stable sort: on equal distance, single-word results stay ahead of
    # compound results
    suggestions.sort(key=lambda s: s.distance)

    # drop duplicate terms, keeping the first (lowest-distance) occurrence
    seen = collections.OrderedDict()
    for obj in suggestions:
        if obj.term not in seen:
            seen[obj.term] = obj
    suggestions = list(seen.values())

    # when no correction is needed (every suggestion equals the input, or
    # there are none at all) signal that with None
    if all(obj.term == input_term for obj in suggestions):
        return

    return suggestions
Example #13
0
def main():
    """Demo against a custom frequency dictionary: single-word lookup for
    'memebers' followed by a compound lookup on a noisy phrase."""
    initial_capacity = 83000
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    # custom dictionary (term in column 0, frequency in column 1)
    dictionary_path = os.path.join(
        "/Users/meheresh/Documents/cm_spellchecker/spellcheck/data",
        "freqdict.txt")
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # single-word lookup (max_edit_distance_lookup <= dictionary max)
    max_edit_distance_lookup = 2
    verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    for hit in sym_spell.lookup("memebers", verbosity,
                                max_edit_distance_lookup):
        # this demo prints count before distance
        print("{}, {}, {}".format(hit.term, hit.count, hit.distance))

    # compound lookup (supports splitting & merging of multi-word input)
    phrase = ("whereis th elove hehad dated forImuch of thepast who "
              "couqdn'tread in sixtgrade and ins pired him")
    for hit in sym_spell.lookup_compound(phrase, max_edit_distance_lookup):
        print("{}, {}, {}".format(hit.term, hit.count, hit.distance))
def main():
    """Demo: single-word lookup for 'memebers' and a compound lookup for
    'AGUDATA OF BIRTH' against the English frequency dictionary."""
    # maximum edit distance per dictionary precalculation
    max_edit_distance_dictionary = 2
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # NOTE(review): because the second argument is absolute, os.path.join
    # discards os.path.dirname(__file__) and uses the /home/raghu path as-is.
    dictionary_path = os.path.join(
        os.path.dirname(__file__),
        "/home/raghu/Downloads/frequency_dictionary_en_82_765.txt")
    # term in column 0, frequency in column 1
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    # single-word lookup (max_edit_distance_lookup <= dictionary max)
    max_edit_distance_lookup = 2
    verbosity = Verbosity.CLOSEST  # TOP, CLOSEST, ALL
    for hit in sym_spell.lookup("memebers", verbosity,
                                max_edit_distance_lookup):
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))

    # compound lookup (supports splitting & merging of multi-word input)
    phrase = ("AGUDATA OF BIRTH")
    suggestions = sym_spell.lookup_compound(phrase, max_edit_distance_lookup)

    # the raw suggestion list is printed first, then each entry
    print(suggestions)
    for hit in suggestions:
        print("{}, {}, {}".format(hit.term, hit.distance, hit.count))
Example #15
0
    def _correct_spelling_errors(self):
        """
        Corrects spelling errors in tweets using symspell.

        Suggestions are only printed, never written back to the dataframe.
        NOTE(review): no dictionary is loaded into sym_spell before
        lookup_compound is called — confirm this is intentional.
        :return: the (unchanged) tweets dataframe.
        """
        # SymSpell parameters come from the class-level symspell_config dict
        sym_spell = SymSpell(
            Corpus.symspell_config["initial_capacity"],
            Corpus.symspell_config["max_edit_distance_dictionary"],
            Corpus.symspell_config["prefix_length"])
        config = Corpus.symspell_config

        # self._tweets_df = self._tweets_df.sample(frac=1)
        for idx, record in self._tweets_df.iterrows():
            # record.text presumably holds the raw tweet text — TODO confirm
            suggestions = sym_spell.lookup_compound(
                record.text, config["max_edit_distance_lookup"])
            for suggestion in suggestions:
                print("  {}, {}, {}".format(suggestion.term, suggestion.count,
                                            suggestion.distance))

        return self._tweets_df
class SpellCorrect:
    """Thin wrapper around symspellpy for sentence-level spelling correction."""

    def __init__(self,
                 max_dictionary_edit_distance=2,
                 prefix_length=7,
                 dictionary_path=None):
        """Create the SymSpell instance and load the frequency dictionary.

        dictionary_path: path to a term/frequency dictionary file; defaults
        to the English dictionary shipped next to this module.
        """
        # maximum edit-distance for doing lookups
        self.max_dictionary_edit_distance = max_dictionary_edit_distance

        # Length of word prefixes used for spell checking
        self.prefix_length = prefix_length

        # create object
        self.sym_spell = SymSpell(
            max_dictionary_edit_distance=self.max_dictionary_edit_distance,
            prefix_length=self.prefix_length)

        # load dictionary
        if dictionary_path is None:
            # fix: the original passed the literal string '__file__', whose
            # dirname is '' — resolve relative to the module's real location
            dictionary_path = os.path.join(
                os.path.dirname(__file__),
                "frequency_dictionary_en_82_765.txt")

        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file

        if not self.sym_spell.load_dictionary(dictionary_path, term_index,
                                              count_index):
            print('Dictionary file not found')

    def spelling_correct(self, input_term):
        """Return the compound-lookup correction of *input_term*.

        lookup_compound supports splitting & merging of multi-word input.
        """
        # max edit distance per lookup (per single word, not per whole input string)
        max_edit_distance_lookup = 2

        suggestions = self.sym_spell.lookup_compound(
            phrase=input_term, max_edit_distance=max_edit_distance_lookup)

        # NOTE(review): joined with no separator — fine while lookup_compound
        # returns a single suggestion, but multiple terms would run together.
        return "".join([suggestion.term for suggestion in suggestions])
def main():
    """Compound-correct a Portuguese demo sentence against alfabeto.txt
    and print each suggestion as term, count, distance."""
    initial_capacity = 83000
    max_edit_distance_dictionary = 3
    prefix_length = 7
    sym_spell = SymSpell(initial_capacity, max_edit_distance_dictionary,
                         prefix_length)
    dictionary_path = "alfabeto.txt"
    # term in column 0, frequency in column 1
    if not sym_spell.load_dictionary(dictionary_path, 0, 1):
        print("Dictionary file not found")
        return

    phrase = ("previdensia sosial é augo difisio e discitido no bra sil")
    max_edit_distance_lookup = 3
    for hit in sym_spell.lookup_compound(phrase, max_edit_distance_lookup):
        print("{}, {}, {}".format(hit.term, hit.count, hit.distance))
Example #18
0
class SymSpell(SpellCheck):
    """SpellCheck adapter backed by symspellpy (aliased here as SymSpellPy)."""

    # maps a mode name to a symspellpy Verbosity; 'cloest' is the historical
    # misspelling of 'closest', kept so existing callers keep working
    _VERBOSITY_BY_MODE = {
        'cloest': Verbosity.CLOSEST,
        'closest': Verbosity.CLOSEST,
        'top': Verbosity.TOP,
        'all': Verbosity.ALL,
    }

    def __init__(self, dictionary_file_path='', dictionary=None, verbose=0):
        super().__init__(dictionary=dictionary, verbose=verbose)

        self.dictionary_file_path = dictionary_file_path
        self.model = None  # set by load_vocab()

    def load_vocab(self,
                   corpus_file_path,
                   max_edit_distance_dictionary=2,
                   prefix_length=5):
        """Initialize the SymSpellPy model from a term/frequency file."""
        self.model = SymSpellPy(
            max_dictionary_edit_distance=max_edit_distance_dictionary,
            prefix_length=prefix_length)

        term_index = 0  # column of the term in the dictionary text file
        count_index = 1  # column of the term frequency in the dictionary text file
        if not self.model.load_dictionary(corpus_file_path, term_index,
                                          count_index):
            print("Dictionary file not found")

    def build_vocab(self, dictionary, file_dir, file_name, verbose=0):
        """Write *dictionary* ({token: frequency}) to file_dir + file_name.

        Data format (one entry per line):
            token frequency
        Example:
            edward 154
            edwards 50

        NOTE(review): the output path is plain string concatenation, so
        file_dir must end with a path separator.
        """
        if not os.path.exists(file_dir):
            os.makedirs(file_dir)
        if self.verbose > 3 or verbose > 3:
            print('Size of dictionary: %d' % len(dictionary))

        with open(file_dir + file_name, "w") as text_file:
            for token, count in dictionary.items():
                text_file.write(token + ' ' + str(count))
                text_file.write('\n')

    def correction(self, word, max_edit_distance_lookup=2, mode='cloest'):
        """Single-word lookup.

        mode: 'closest' (or the legacy spelling 'cloest'), 'top', or 'all'.
        Returns [{'word', 'count', 'distance'}, ...].
        Raises ValueError on an unknown mode (the original left
        suggestion_verbosity unbound and crashed with UnboundLocalError).
        """
        try:
            suggestion_verbosity = self._VERBOSITY_BY_MODE[mode]
        except KeyError:
            raise ValueError("Unknown mode: %r" % (mode,))

        results = self.model.lookup(word, suggestion_verbosity,
                                    max_edit_distance_lookup)

        return [{
            'word': suggestion.term,
            'count': suggestion.count,
            'distance': suggestion.distance
        } for suggestion in results]

    def corrections(self, sentence, max_edit_distance_lookup=2):
        """Compound lookup on the lower-cased sentence.

        Returns [{'word', 'distance'}, ...].
        """
        results = self.model.lookup_compound(sentence.lower(),
                                             max_edit_distance_lookup)

        return [{
            'word': suggestion.term,
            'distance': suggestion.distance
        } for suggestion in results]
class WordSimilarity:
    """Score semantic similarity between candidate labels and a set of
    (possibly misspelled) words using spaCy vectors, with SymSpell
    compound lookup for spelling correction of noisy input."""

    def __init__(self, spell):
        """Load the SymSpell dictionaries and the spaCy model.

        Args:
            spell: external spell checker exposing ``correction(word)``,
                used by removeNoise().
        """
        max_edit_distance_dictionary = 2
        prefix_length = 7
        self.sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
        dictionary_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_dictionary_en_82_765.txt")
        bigram_path = pkg_resources.resource_filename(
            "symspellpy", "frequency_bigramdictionary_en_243_342.txt")
        if not self.sym_spell.load_dictionary(
                dictionary_path, term_index=0, count_index=1):
            print("Dictionary file not found")
            return
        if not self.sym_spell.load_bigram_dictionary(
                bigram_path, term_index=0, count_index=2):
            print("Bigram dictionary file not found")
            return

        self.nlp = spacy.load(
            "shop_recognizer/semantic_detector/models/en_core_web_lg")
        self.spell = spell

    def checkSemanticSimilarity(self, labels, words):
        """Compare each label against ALL cleaned words joined into one
        document; return {label: softmax probability} (0 when ``words``
        is empty)."""
        result = {}
        texts = self.removeNoise2(words)
        # The joined document is the same for every label, so build it
        # once instead of re-concatenating inside the loop (fix: the
        # original rebuilt this string per label). Each token keeps a
        # trailing space, matching the original concatenation exactly.
        joined = "".join(text + " " for text in texts)
        doc2 = self.nlp(joined)
        for label in labels:
            doc1 = self.nlp(label)
            score = doc2.similarity(doc1)
            result[label] = int(score * 100)
        prob = self.softmax(labels, result)
        counter = 0
        for cls in labels:
            if len(words):
                result[cls] = float(prob[counter])
                counter = counter + 1
            else:
                result[cls] = 0
        return result

    def checkSemanticSimilarity2(self, labels, words):
        """Like checkSemanticSimilarity, but scores each label by its
        BEST single-word similarity instead of the joined document."""
        result = {}
        texts = self.removeNoise2(words)
        for label in labels:
            tmp = 0
            doc1 = self.nlp(label)
            for text in texts:
                doc2 = self.nlp(text)
                similarity = doc2.similarity(doc1)
                if similarity > tmp:
                    tmp = similarity
            result[label] = int(tmp * 100)
        prob = self.softmax(labels, result)
        counter = 0
        for cls in labels:
            if len(words):
                result[cls] = float(prob[counter])
                counter = counter + 1
            else:
                result[cls] = 0
        return result

    def removeNoise(self, words):
        """Filter out short/numeric/stop words; spell-correct the rest via
        the external checker and keep only words with a spaCy vector."""
        result = []
        for word in words:
            if len(word) > 2 and (word.isdigit() is False):
                if (word in self.nlp.Defaults.stop_words):
                    continue
                else:
                    newWord = self.spell.correction(word)
                    if self.nlp.vocab.has_vector(newWord):
                        result.append(newWord)
        return result

    def removeNoise2(self, words):
        """Filter out short/numeric words; correct the rest with SymSpell
        compound lookup and strip spaces from the corrected result."""
        result = []
        for word in words:
            if len(word) > 2 and (word.isdigit() is False):
                newWord = self.correct(word)
                newWord = newWord.replace(" ", "")
                result.append(newWord)
        return result

    def correct(self, word):
        """Return the top SymSpell compound-lookup correction for ``word``."""
        input_term = (word)
        max_edit_distance_lookup = 2
        suggestions = self.sym_spell.lookup_compound(input_term,
                                                     max_edit_distance_lookup)
        return suggestions[0].term

    def softmax(self, classes, scores):
        """Return the softmax over ``scores[cls]`` for cls in ``classes``."""
        values = []
        for cls in classes:
            values.append(scores[cls])
        ex = np.exp(values)
        # Fix: reuse the exponentials instead of recomputing np.exp().
        return ex / np.sum(ex)
Exemple #20
0
class Cleaner:
    """
    Cleaner object for the first type of documents.

    Extracts court judgements from Gallica OCR HTML pages, post-processes
    the resulting dataframe and spell-checks the judgement text with
    SymSpell against a French frequency dictionary.
    """
    def __init__(self, directory, lexique_path, dict_path):
        """
        args:
        directory: directory where the CSV will be stored
        dict_path: path of the dictionary (known-good word list)
        lexique_path: path of the lexique (French lexicon word list)
        """
        self.directory = directory
        self.dict_path = dict_path
        # French stop words count as correctly spelled.
        self.stopwords = list(nltk.corpus.stopwords.words('french'))
        self.lexique_path = lexique_path
        # Union of dictionary, stop words and lexicon: tokens found here
        # are never sent to the spell checker in correct().
        self.words = load_dictionnary(
            self.dict_path) + self.stopwords + load_lexique(self.lexique_path)
        # Cache of token -> correction to avoid repeated SymSpell lookups.
        self.corrected = {}

        self.max_edit_distance_dictionary = 2
        self.prefix_length = 7
        self.sym_spell = SymSpell(self.max_edit_distance_dictionary,
                                  self.prefix_length)
        # French 100k-word frequency dictionary used by SymSpell.
        self.dictionary_path = "../ressources/fr-100k.txt"
        self.sym_spell.load_dictionary(self.dictionary_path,
                                       term_index=0,
                                       count_index=1)
        pass

    def extract(self, file):
        """ function to extract the judgements
        args: file to extract (HTML content of one scanned volume)
        return: dataframe of the judgements """
        soup = BeautifulSoup(file, "html.parser")
        df = pd.DataFrame(columns=["page", "arrêt", "date", "juridiction"])
        # State: in-decision flag, in-footnotes flag, current page number,
        # page-just-turned flag, decision-just-started flag, paragraph count.
        Decision, notes, page, new_page, new_decision, count = False, False, 0, True, False, 1
        for tag in soup.body:
            if count == 15:  # limit length of judgement to 15 paragraphs
                count = 0
                Decision = False
            new_decision = False
            string = tag.get_text()
            if tag.name == "hr":  # hr means new page
                page += 1
                notes = False
                new_page = True
            if tag.name == "p" and string is not None and not new_page:
                # pattern : start judgement ("La Cour, ..." / "JUGEMENT" /
                # "ARRÊT", allowing OCR spacing between the letters)
                m1 = re.match(
                    r"(^.*?(La Cour,|L(A|À|a) COUR)(?! DE)(.+)$|^J\s?U\s?G\s?E\s?M\s?E\s?N\s?T\.?\s?$|^A\s?R\s?R\s?(Ê|E)\s?T\s?\.?\s?$)",
                    string)
                # pattern: end judgement ("Du <date> — <juridiction>")
                m2 = re.match(r"(.*?)D(u|û|ù)(.+?)(—|–|-|–|–)(.+)", string)

                if not Decision and m1:  # if new decision
                    Decision = True
                    text = ""
                    count = 1

                    if m1.groups()[3] != None:  # extract the text after "La Cour"
                        text = str(m1.groups()[3])
                    First_page = page
                    new_decision = True

                if Decision and m2:  # case : end of judgement
                    if count < 15:
                        if (new_decision):
                            text = m2.groups()[0]
                        else:
                            text += m2.groups()[0]
                        date = m2.groups()[2]
                        juridiction = m2.groups()[4]
                        df = df.append(
                            {
                                'page': First_page,
                                'arrêt': text,
                                "date": date,
                                "juridiction": juridiction
                            },
                            ignore_index=True)
                    Decision = False
                    text = ''
                elif not notes and Decision and not new_decision:
                    # Accumulate body paragraphs; a paragraph starting with
                    # "(n)" marks the beginning of the footnote block.
                    if not re.match(r"^\(\d*\).+$", string):
                        count += 1
                        text += string + "\n"
                    else:
                        notes = True
                else:
                    pass
            else:
                new_page = False
        return df

    def save(self, df, ark, year):
        """ function to save the DF as <directory>/<year>/<ark>.csv """
        df.to_csv(f"{self.directory}/{year}/{ark}.csv",
                  encoding="utf-8",
                  sep=";")
        pass

    def postProcess(self, df, ark, year, recceuil):
        """ function to post process: split mixed date/juridiction cells,
        drop malformed rows, then add Gallica links and a unique id """
        # fix mix date-juridiction
        Rows_contains_ = df['date'].str.contains(r"(—|–|-)")
        for i, row in df[Rows_contains_].iterrows():
            m = re.search(r"(.+?)(—|–|-|—)(.+)(—|–|-|—)?.*", row["date"])
            if m:
                df.at[i, "date"] = m.groups()[0]
                df.at[i, "juridiction"] = m.groups()[2]
        #if still not fixed --> drop them
        Rows_contains_ = df['date'].str.contains(r"(—|–|-)")
        df = df[Rows_contains_ == False]
        # drop date too long
        leng = df["date"].str.len()
        df = df[leng < 25]  # drop too long date
        # drop date with no number
        number = df["date"].str.contains("^\D*$")
        df = df[number == False]  # drop dates containing no digit at all
        length_decision = df.arrêt.str.len()
        # drop decision too short
        df = df[length_decision > 100]
        # drop juridiction too long: keep only the part before the first
        # dash / comma / semicolon
        for i, row in df.iterrows():
            m = re.search(r"(.+?)(—|–|-|—|,|;).*", row["juridiction"])
            if m:
                df.at[i, "juridiction"] = m.groups()[0]

        # add link to the scanned page on Gallica
        df["lien"] = "https://gallica.bnf.fr/ark:/12148/" + ark + "/f"
        df["lien"] = df["lien"] + df.page.map(str) + ".image"

        # unique id: year + recueil + row index
        df["id"] = "" + str(year) + str(recceuil) + df.index.map(str)
        df.index = df.id
        return df

    def spell_check(self, df):
        """ apply the spell checking on the df (column "arrêt") """
        df["arrêt"] = df["arrêt"].apply(self.correct)
        return df

    def correct(self, text):
        """ spell check text: correct lowercase alphabetic tokens that are
        not in the known-word list, caching each correction """
        ntokens = []
        # split on whitespace and punctuation / dash variants
        tokens = re.split('\s|,|\.|;|—|–|-|–|–|\n|:|\!|\?', text)
        for t in tokens:
            # Only correct purely alphabetic, unknown, lowercase-initial
            # tokens (capitalized words are assumed to be proper nouns).
            if (str(t).lower().isalpha() and not str(t).lower() in self.words
                    and not str(t)[0].isupper()):
                if str(t) in self.corrected:
                    nt = self.corrected[t]
                else:
                    nt = t
                    suggestion = self.sym_spell.lookup_compound(t, 2)
                    if len(suggestion) > 0:
                        nt = suggestion[0].term

                    self.corrected[t] = nt
                ntokens.append(nt)

            else:
                ntokens.append(t)
        return " ".join(ntokens)
Exemple #21
0
term_index = 0  # column of the term in the dictionary text file
count_index = 1  # column of the term frequency in the dictionary text file

# NOTE(review): sym_spell, dictionary_path, Corpus, stopwords and
# word_tokenize are defined earlier in the file (not visible here).
if not sym_spell.load_dictionary(dictionary_path, term_index,
                                 count_index):  #loading the dictionary
    print("Dictionary file not found")

placeHolderList = []

# lookup suggestions for multi-word input strings
# Conversion is needed: sym_spell.lookup_compound takes a string
input_corpus = (str(Corpus))

# max edit distance per lookup (per single word, not per whole input string)
max_edit_distance_lookup = 2
suggestions = sym_spell.lookup_compound(input_corpus, max_edit_distance_lookup)
# Putting everything back in a list format. 'placeHolderList' is temporary
for suggestion in suggestions:
    placeHolderList.append(suggestion.term)

stopWords = set(
    stopwords.words('english'))  # getting stop words to clean up the corpus
# Tokenize split everything into a list; 'join' makes it a string again.
words = word_tokenize(''.join(placeHolderList))

print(words)  # Checking the full list

cleanedCorpus = []

# This loop takes out all the stopwords (no additional stopwords added).
for w in words:
Exemple #22
0
class SpellCheck:
    """Build, persist and query a SymSpell spelling dictionary for
    geoname place-name words."""

    def __init__(self, progress, directory, countries_dict):
        self.progress = progress
        self.logger = logging.getLogger(__name__)
        # word -> occurrence count, accumulated via insert().
        self.spelling_update = Counter()
        self.directory = directory
        self.spell_path = os.path.join(self.directory, 'spelling.pkl')
        self.countries_dict = countries_dict
        self.sym_spell = SymSpell()

    def insert(self, name, iso):
        """Count every word (longer than 2 chars) of a place name.

        Names containing 'gothland cemetery' or listed in the module's
        noise_words are skipped entirely. ``iso`` is accepted for
        interface compatibility but unused here.
        """
        if 'gothland cemetery' not in name and name not in noise_words:
            for word in name.split(' '):
                # Fix: the redundant f'{word}' copy was removed.
                if len(word) > 2:
                    self.spelling_update[word] += 1

    def write(self):
        """Create the SymSpell dictionary from the accumulated counts and
        save it to ``self.spell_path`` as a pickle."""
        # Seed SymSpell with a minimal one-word corpus file.
        path = os.path.join(self.directory, 'spelling.tmp')
        # Fix: use a context manager so the handle is closed even on error.
        with open(path, 'w') as fl:
            fl.write('the,1\n')
        success = self.sym_spell.create_dictionary(corpus=path)
        if not success:
            self.logger.error("error creating spelling dictionary")

        self.logger.info('Building Spelling Dictionary')

        # Add all words from geonames into spelling dictionary
        for key in self.spelling_update:
            self.sym_spell.create_dictionary_entry(
                key=key, count=self.spelling_update[key])

        self.logger.info('Writing Spelling Dictionary')
        self.sym_spell.save_pickle(self.spell_path)

    def read(self):
        """Load the pickled spelling dictionary, removing the 'gothland'
        noise entry on success."""
        success = False
        if os.path.exists(self.spell_path):
            self.logger.info(
                f'Loading Spelling Dictionary from {self.spell_path}')
            success = self.sym_spell.load_pickle(self.spell_path)
        else:
            self.logger.error(
                f"spelling dictionary not found: {self.spell_path}")

        if not success:
            self.logger.error(
                f"error loading spelling dictionary from {self.spell_path}")
        else:
            self.sym_spell.delete_dictionary_entry(key='gothland')

        size = len(self.sym_spell.words)
        self.logger.info(f"Spelling Dictionary contains {size} words")

    def lookup(self, input_term):
        """Return up to 4 space-joined suggestions whose first letter
        matches the input; terms with '*' or length <= 1 pass through."""
        if '*' in input_term:
            return input_term
        res = ''
        if len(input_term) > 1:
            suggestions = self.sym_spell.lookup(input_term,
                                                Verbosity.CLOSEST,
                                                max_edit_distance=2,
                                                include_unknown=True)
            for idx, item in enumerate(suggestions):
                if idx > 3:
                    break
                # NOTE(review): accesses the private _term attribute;
                # symspellpy exposes a public .term property — confirm
                # before switching.
                if item._term[0] == input_term[0]:
                    # Only accept results where first letter matches
                    res += item._term + ' '
            return res
        else:
            return input_term

    def lookup_compound(self, phrase):
        """Return the top compound-lookup correction for a phrase."""
        suggestions = self.sym_spell.lookup_compound(phrase=phrase,
                                                     max_edit_distance=2,
                                                     ignore_non_words=False)
        for item in suggestions:
            self.logger.debug(f'{item._term}')
        return suggestions[0]._term

    def fix_spelling(self, text):
        """Spell-correct ``text`` unless it contains digits or 'st '
        (both skip correction); returns the result stripped of spaces."""
        new_text = text
        if bool(re.search(r'\d', text)):
            # Has digits, just return text, no spellcheck
            pass
        elif 'st ' in text:
            # Spellcheck not handling St properly
            pass
        else:
            if len(text) > 0:
                new_text = self.lookup(text)
                self.logger.debug(f'Spell {text} -> {new_text}')

        return new_text.strip(' ')
# First pass: correct misspelled tokens with SymSpell compound lookup.
# NOTE(review): spell, injury_tokens, tokenizer, regex and injury_text
# are defined earlier in the file (not visible here).
misspelled=spell.unknown(injury_tokens)

max_edit_distance_dictionary = 2
prefix_length = 7
sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)

# Load the English unigram and bigram frequency dictionaries bundled
# with symspellpy.
dictionary_path = pkg_resources.resource_filename("symspellpy", "frequency_dictionary_en_82_765.txt")
bigram_path = pkg_resources.resource_filename("symspellpy", "frequency_bigramdictionary_en_243_342.txt")
sym_spell.load_dictionary(dictionary_path, term_index=0, count_index=1)
sym_spell.load_bigram_dictionary(bigram_path, term_index=0, count_index=2)

max_edit_distance_lookup = 2
suggestion_verbosity = Verbosity.CLOSEST

# Replace each misspelled word in the text; only space-delimited
# occurrences are substituted (pattern is " word ").
for word in misspelled:
    spell_corrected_obj=sym_spell.lookup_compound(word, max_edit_distance_lookup)
    if len(spell_corrected_obj) > 0:
        spell_correction= spell_corrected_obj[0].term
        injury_text=regex.sub(pattern=" "+word+" ", repl=" "+spell_correction+" ", string=injury_text)

# Second pass: re-tokenize and correct leftovers with the `spell` checker.
injury_tokens=tokenizer.tokenize(injury_text)
misspelled=spell.unknown(injury_tokens)

for word in misspelled:
    spell_correction = spell.correction(word)
    injury_text=regex.sub(pattern=" "+word+" ", repl=" "+spell_correction+" ", string=injury_text)
    
injury_tokens=tokenizer.tokenize(injury_text)
misspelled=spell.unknown(injury_tokens)

for word in misspelled:
Exemple #24
0
def main():
    """OCR a word image from disk with Tesseract, then spell-correct the
    recognized text with SymSpell compound lookup and print each
    suggestion's term, edit distance and frequency."""
    # Load the image from the desktop; flag 0 reads it as grayscale.
    img_file = '/Users/emily/Desktop/basic_word2.png'
    img = cv2.imread(img_file, 0)

    # Fix: bail out if the image could not be read — the original only
    # printed a message and then crashed operating on the None image.
    if img is None:
        print("Could not read:", img_file)
        return

    # Upscale the grayscale image: larger input markedly improves OCR.
    gray = cv2.resize(img, None, fx=3, fy=3, interpolation=cv2.INTER_CUBIC)

    # NOTE(review): the original also called cv2.threshold() and
    # cv2.bilateralFilter() but never used either result; the dead calls
    # were removed without changing the saved output.

    # Save the processed grayscale image for Tesseract to read.
    filename = "/Users/emily/Desktop/gray_image.png"
    cv2.imwrite(filename, gray)

    # Extract the text from the image.
    text = pytesseract.image_to_string(Image.open(filename), lang='eng')

    # SymSpell setup: maximum edit distance per dictionary precalculation.
    max_edit_distance_dictionary = 5
    prefix_length = 7
    sym_spell = SymSpell(max_edit_distance_dictionary, prefix_length)
    # Load the English frequency dictionary.
    dictionary_path = os.path.join(
        os.path.dirname(r"/Users/emily/Documents/Tinovation/spellcheck2.py"),
        "/Users/emily/Desktop/frequency_dictionary_en_82_765.txt")
    term_index = 0   # column of the term in the dictionary text file
    count_index = 1  # column of the term frequency in the dictionary file
    if not sym_spell.load_dictionary(dictionary_path, term_index, count_index):
        print("Dictionary file not found")
        return

    # Compound lookup supports splitting & merging of multi-word input.
    # (The original assigned two hard-coded test strings to input_term
    # and immediately overwrote them with `text`; the dead assignments
    # were removed.)
    input_term = text
    # max edit distance per lookup (per single word, not per whole string)
    max_edit_distance_lookup = 2
    suggestions = sym_spell.lookup_compound(input_term,
                                            max_edit_distance_lookup)
    # Display suggestion term, edit distance, and term frequency.
    for suggestion in suggestions:
        print("{}, {}, {}".format(suggestion.term, suggestion.distance,
                                  suggestion.count))